Update aosp/master LLVM for rebase to r230699.

Change-Id: I2b5be30509658cb8266be782de0ab24f9099f9b9
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index 91c1314..58b87e1 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -296,6 +296,16 @@
   std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
     RegRefs = State->GetRegRefs();
 
+  // FIXME: We must leave subregisters of live super registers as live, so that
+  // we don't clear out the register tracking information for subregisters of
+  // super registers we're still tracking (and with which we're unioning
+  // subregister definitions).
+  for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+    if (TRI->isSuperRegister(Reg, *AI) && State->IsLive(*AI)) {
+      DEBUG(if (!header && footer) dbgs() << footer);
+      return;
+    }
+
   if (!State->IsLive(Reg)) {
     KillIndices[Reg] = KillIdx;
     DefIndices[Reg] = ~0u;
@@ -673,6 +683,21 @@
           goto next_super_reg;
       }
 
+      // We cannot rename 'Reg' to 'NewReg' if one of the uses of 'Reg' also
+      // defines 'NewReg' via an early-clobber operand.
+      auto Range = RegRefs.equal_range(Reg);
+      for (auto Q = Range.first, QE = Range.second; Q != QE; ++Q) {
+        auto UseMI = Q->second.Operand->getParent();
+        int Idx = UseMI->findRegisterDefOperandIdx(NewReg, false, true, TRI);
+        if (Idx == -1)
+          continue;
+
+        if (UseMI->getOperand(Idx).isEarlyClobber()) {
+          DEBUG(dbgs() << "(ec)");
+          goto next_super_reg;
+        }
+      }
+
       // Record that 'Reg' can be renamed to 'NewReg'.
       RenameMap.insert(std::pair<unsigned, unsigned>(Reg, NewReg));
     }
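The check added above rejects a rename when a use of 'Reg' also defines 'NewReg' through an early-clobber operand: an early-clobber def is written before the instruction's inputs are consumed, so it must not share a register with any of them, and renaming an input into that register would violate the constraint. A minimal sketch of the constraint being protected, using GCC/Clang extended asm with a hypothetical x86-64 instruction sequence (not code from this patch):

#include <cstdio>

// The "=&r" modifier marks the output as early-clobber: the compiler must not
// assign it the same register as the input, because the output is written
// before the input is last read.
static unsigned long copy_then_bump(unsigned long in) {
  unsigned long out;
  asm("movq %1, %0\n\t"
      "addq $1, %0"
      : "=&r"(out)   // early-clobber output
      : "r"(in));
  return out;
}

int main() { std::printf("%lu\n", copy_then_bump(41)); }  // prints 42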
diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp
index 9a3b790..e50b846 100644
--- a/lib/CodeGen/Analysis.cpp
+++ b/lib/CodeGen/Analysis.cpp
@@ -30,10 +30,9 @@
 
 using namespace llvm;
 
-/// ComputeLinearIndex - Given an LLVM IR aggregate type and a sequence
-/// of insertvalue or extractvalue indices that identify a member, return
-/// the linearized index of the start of the member.
-///
+/// Compute the linearized index of a member in a nested aggregate/struct/array
+/// by recursing and accumulating CurIndex as long as there are indices in the
+/// index list.
 unsigned llvm::ComputeLinearIndex(Type *Ty,
                                   const unsigned *Indices,
                                   const unsigned *IndicesEnd,
@@ -52,16 +51,23 @@
         return ComputeLinearIndex(*EI, Indices+1, IndicesEnd, CurIndex);
       CurIndex = ComputeLinearIndex(*EI, nullptr, nullptr, CurIndex);
     }
+    assert(!Indices && "Unexpected out of bound");
     return CurIndex;
   }
   // Given an array type, recursively traverse the elements.
   else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
     Type *EltTy = ATy->getElementType();
-    for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i) {
-      if (Indices && *Indices == i)
-        return ComputeLinearIndex(EltTy, Indices+1, IndicesEnd, CurIndex);
-      CurIndex = ComputeLinearIndex(EltTy, nullptr, nullptr, CurIndex);
+    unsigned NumElts = ATy->getNumElements();
+    // Compute the linear offset added when stepping over one element of the array
+    unsigned EltLinearOffset = ComputeLinearIndex(EltTy, nullptr, nullptr, 0);
+    if (Indices) {
+      assert(*Indices < NumElts && "Unexpected out of bound");
+      // If the index is within the array, compute the offset of the requested
+      // element and recurse into it with the remaining indices.
+      CurIndex += EltLinearOffset* *Indices;
+      return ComputeLinearIndex(EltTy, Indices+1, IndicesEnd, CurIndex);
     }
+    CurIndex += EltLinearOffset*NumElts;
     return CurIndex;
   }
   // We haven't found the type we're looking for, so keep searching.
@@ -512,8 +518,9 @@
         return false;
     }
 
+  const Function *F = ExitBB->getParent();
   return returnTypeIsEligibleForTailCall(
-      ExitBB->getParent(), I, Ret, *TM.getSubtargetImpl()->getTargetLowering());
+      F, I, Ret, *TM.getSubtargetImpl(*F)->getTargetLowering());
 }
 
 bool llvm::returnTypeIsEligibleForTailCall(const Function *F,
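The rewritten array case in ComputeLinearIndex above no longer recurses into every element: it computes the number of leaves per element once (EltLinearOffset) and multiplies by the requested index. A standalone sketch of that arithmetic, using a made-up shape of "array of N structs with K scalar leaves each" rather than real LLVM types:

#include <cassert>
#include <cstdio>

// Toy shape: an array of N elements, each element a struct with K scalar leaves.
unsigned linearIndex(unsigned N, unsigned K, unsigned ArrayIdx, unsigned FieldIdx) {
  unsigned EltLinearOffset = K;   // leaves skipped when stepping over one element
  assert(ArrayIdx < N && "index out of bounds");
  return EltLinearOffset * ArrayIdx + FieldIdx;
}

int main() {
  // For [4 x {i32, i32, i32}] and indices {2, 1}: 3 * 2 + 1 == 7.
  std::printf("%u\n", linearIndex(4, 3, 2, 1));
}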
diff --git a/lib/CodeGen/Android.mk b/lib/CodeGen/Android.mk
index 5cb351d..ec3cd77 100644
--- a/lib/CodeGen/Android.mk
+++ b/lib/CodeGen/Android.mk
@@ -24,6 +24,7 @@
   ForwardControlFlowIntegrity.cpp \
   GCMetadata.cpp \
   GCMetadataPrinter.cpp \
+  GCRootLowering.cpp \
   GCStrategy.cpp \
   GlobalMerge.cpp \
   IfConversion.cpp \
@@ -95,6 +96,7 @@
   ScheduleDAGPrinter.cpp \
   ScoreboardHazardRecognizer.cpp \
   ShadowStackGC.cpp \
+  ShadowStackGCLowering.cpp \
   SjLjEHPrepare.cpp \
   SlotIndexes.cpp \
   SpillPlacement.cpp \
@@ -104,6 +106,7 @@
   StackMaps.cpp \
   StackProtector.cpp \
   StackSlotColoring.cpp \
+  StatepointExampleGC.cpp \
   TailDuplication.cpp \
   TargetFrameLoweringImpl.cpp \
   TargetInstrInfo.cpp \
@@ -114,7 +117,8 @@
   TargetSchedule.cpp \
   TwoAddressInstructionPass.cpp \
   UnreachableBlockElim.cpp \
-  VirtRegMap.cpp
+  VirtRegMap.cpp \
+  WinEHPrepare.cpp
 
 # For the host
 # =====================================================
diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp
index 66c6c63..6fe75ad 100644
--- a/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -88,8 +88,7 @@
                                                   Asm->getFunctionNumber()));
     if (!MMI->getLandingPads().empty()) {
       // Emit references to personality.
-      if (const Function * Personality =
-          MMI->getPersonalities()[MMI->getPersonalityIndex()]) {
+      if (const Function *Personality = MMI->getPersonality()) {
         MCSymbol *PerSym = Asm->getSymbol(Personality);
         Asm->OutStreamer.EmitSymbolAttribute(PerSym, MCSA_Global);
         ATS.emitPersonality(PerSym);
diff --git a/lib/CodeGen/AsmPrinter/Android.mk b/lib/CodeGen/AsmPrinter/Android.mk
index cb8e96a..0ce457f 100644
--- a/lib/CodeGen/AsmPrinter/Android.mk
+++ b/lib/CodeGen/AsmPrinter/Android.mk
@@ -13,6 +13,7 @@
   DwarfCFIException.cpp \
   DwarfCompileUnit.cpp \
   DwarfDebug.cpp \
+  DwarfExpression.cpp \
   DwarfFile.cpp \
   DwarfStringPool.cpp \
   DwarfUnit.cpp \
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 8a32713..988381d 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -41,9 +41,11 @@
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/Timer.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetInstrInfo.h"
@@ -98,15 +100,17 @@
   return NumBits;
 }
 
-AsmPrinter::AsmPrinter(TargetMachine &tm, MCStreamer &Streamer)
+AsmPrinter::AsmPrinter(TargetMachine &tm, std::unique_ptr<MCStreamer> Streamer)
     : MachineFunctionPass(ID), TM(tm), MAI(tm.getMCAsmInfo()),
-      MII(tm.getSubtargetImpl()->getInstrInfo()),
-      OutContext(Streamer.getContext()), OutStreamer(Streamer), LastMI(nullptr),
-      LastFn(0), Counter(~0U), SetCounter(0) {
-  DD = nullptr; MMI = nullptr; LI = nullptr; MF = nullptr;
+      OutContext(Streamer->getContext()), OutStreamer(*Streamer.release()),
+      LastMI(nullptr), LastFn(0), Counter(~0U), SetCounter(0) {
+  DD = nullptr;
+  MMI = nullptr;
+  LI = nullptr;
+  MF = nullptr;
   CurrentFnSym = CurrentFnSymForSize = nullptr;
   GCMetadataPrinters = nullptr;
-  VerboseAsm = Streamer.isVerboseAsm();
+  VerboseAsm = OutStreamer.isVerboseAsm();
 }
 
 AsmPrinter::~AsmPrinter() {
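The constructor change above takes the streamer by std::unique_ptr and stores a reference after releasing it, so ownership of the streamer moves from the caller into the AsmPrinter. A hedged sketch of that ownership pattern with made-up class names (not the actual LLVM declarations):

#include <memory>

struct Streamer {
  bool isVerboseAsm() const { return true; }
};

class Printer {
  Streamer &Out;   // owned: released from the unique_ptr in the constructor

public:
  explicit Printer(std::unique_ptr<Streamer> S) : Out(*S.release()) {}
  ~Printer() { delete &Out; }   // the printer, not the caller, destroys it
  bool verbose() const { return Out.isVerboseAsm(); }
};

int main() {
  Printer P(std::make_unique<Streamer>());
  return P.verbose() ? 0 : 1;
}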
@@ -129,16 +133,17 @@
 }
 
 const TargetLoweringObjectFile &AsmPrinter::getObjFileLowering() const {
-  return TM.getSubtargetImpl()->getTargetLowering()->getObjFileLowering();
+  return *TM.getObjFileLowering();
 }
 
 /// getDataLayout - Return information about data layout.
 const DataLayout &AsmPrinter::getDataLayout() const {
-  return *TM.getSubtargetImpl()->getDataLayout();
+  return *TM.getDataLayout();
 }
 
 const MCSubtargetInfo &AsmPrinter::getSubtargetInfo() const {
-  return TM.getSubtarget<MCSubtargetInfo>();
+  assert(MF && "getSubtargetInfo requires a valid MachineFunction!");
+  return MF->getSubtarget<MCSubtargetInfo>();
 }
 
 void AsmPrinter::EmitToStreamer(MCStreamer &S, const MCInst &Inst) {
@@ -175,7 +180,7 @@
 
   OutStreamer.InitSections(false);
 
-  Mang = new Mangler(TM.getSubtargetImpl()->getDataLayout());
+  Mang = new Mangler(TM.getDataLayout());
 
   // Emit the version-min deployment target directive if needed.

   //
@@ -210,7 +215,7 @@
   assert(MI && "AsmPrinter didn't require GCModuleInfo?");
   for (auto &I : *MI)
     if (GCMetadataPrinter *MP = GetOrCreateGCPrinter(*I))
-      MP->beginAssembly(*this);
+      MP->beginAssembly(M, *MI, *this);
 
   // Emit module-level inline asm if it exists.
   if (!M.getModuleInlineAsm().empty()) {
@@ -222,12 +227,25 @@
   }
 
   if (MAI->doesSupportDebugInformation()) {
-    if (Triple(TM.getTargetTriple()).isKnownWindowsMSVCEnvironment())
+    bool skip_dwarf = false;
+    if (Triple(TM.getTargetTriple()).isKnownWindowsMSVCEnvironment()) {
       Handlers.push_back(HandlerInfo(new WinCodeViewLineTables(this),
                                      DbgTimerName,
                                      CodeViewLineTablesGroupName));
-    DD = new DwarfDebug(this, &M);
-    Handlers.push_back(HandlerInfo(DD, DbgTimerName, DWARFGroupName));
+      // FIXME: Don't emit DWARF debug info if there's at least one function
+      // with AddressSanitizer instrumentation.
+      // This is a band-aid fix for PR22032.
+      for (auto &F : M.functions()) {
+        if (F.hasFnAttribute(Attribute::SanitizeAddress)) {
+          skip_dwarf = true;
+          break;
+        }
+      }
+    }
+    if (!skip_dwarf) {
+      DD = new DwarfDebug(this, &M);
+      Handlers.push_back(HandlerInfo(DD, DbgTimerName, DWARFGroupName));
+    }
   }
 
   EHStreamer *ES = nullptr;
@@ -241,7 +259,7 @@
   case ExceptionHandling::ARM:
     ES = new ARMException(this);
     break;
-  case ExceptionHandling::ItaniumWinEH:
+  case ExceptionHandling::WinEH:
     switch (MAI->getWinEHEncodingType()) {
     default: llvm_unreachable("unsupported unwinding information encoding");
     case WinEH::EncodingType::Itanium:
@@ -323,6 +341,11 @@
     if (EmitSpecialLLVMGlobal(GV))
       return;
 
+    // Skip the emission of global equivalents. The symbol can be emitted later
+    // on by emitGlobalGOTEquivs in case it turns out to be needed.
+    if (GlobalGOTEquivs.count(getSymbol(GV)))
+      return;
+
     if (isVerbose()) {
       GV->printAsOperand(OutStreamer.GetCommentOS(),
                      /*PrintType=*/false, GV->getParent());
@@ -336,12 +359,17 @@
   if (!GV->hasInitializer())   // External globals require no extra code.
     return;
 
+  GVSym->redefineIfPossible();
+  if (GVSym->isDefined() || GVSym->isVariable())
+    report_fatal_error("symbol '" + Twine(GVSym->getName()) +
+                       "' is already defined");
+
   if (MAI->hasDotTypeDotSizeDirective())
     OutStreamer.EmitSymbolAttribute(GVSym, MCSA_ELF_TypeObject);
 
   SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, TM);
 
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = TM.getDataLayout();
   uint64_t Size = DL->getTypeAllocSize(GV->getType()->getElementType());
 
   // If the alignment is specified, we *must* obey it.  Overaligning a global
@@ -508,6 +536,10 @@
     OutStreamer.GetCommentOS() << '\n';
   }
 
+  // Emit the prefix data.
+  if (F->hasPrefixData())
+    EmitGlobalConstant(F->getPrefixData());
+
   // Emit the CurrentFnSym.  This is a virtual function to allow targets to
   // do their wild and crazy things as required.
   EmitFunctionEntryLabel();
@@ -528,27 +560,32 @@
     HI.Handler->beginFunction(MF);
   }
 
-  // Emit the prefix data.
-  if (F->hasPrefixData())
-    EmitGlobalConstant(F->getPrefixData());
+  // Emit the prologue data.
+  if (F->hasPrologueData())
+    EmitGlobalConstant(F->getPrologueData());
 }
 
 /// EmitFunctionEntryLabel - Emit the label that is the entrypoint for the
 /// function.  This can be overridden by targets as required to do custom stuff.
 void AsmPrinter::EmitFunctionEntryLabel() {
+  CurrentFnSym->redefineIfPossible();
+
   // The function label could have already been emitted if two symbols end up
   // conflicting due to asm renaming.  Detect this and emit an error.
-  if (CurrentFnSym->isUndefined())
-    return OutStreamer.EmitLabel(CurrentFnSym);
+  if (CurrentFnSym->isVariable())
+    report_fatal_error("'" + Twine(CurrentFnSym->getName()) +
+                       "' is a protected alias");
+  if (CurrentFnSym->isDefined())
+    report_fatal_error("'" + Twine(CurrentFnSym->getName()) +
+                       "' label emitted multiple times to assembly file");
 
-  report_fatal_error("'" + Twine(CurrentFnSym->getName()) +
-                     "' label emitted multiple times to assembly file");
+  return OutStreamer.EmitLabel(CurrentFnSym);
 }
 
 /// emitComments - Pretty-print comments for instructions.
 static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
   const MachineFunction *MF = MI.getParent()->getParent();
-  const TargetMachine &TM = MF->getTarget();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
 
   // Check for spills and reloads
   int FI;
@@ -558,24 +595,20 @@
   // We assume a single instruction only has a spill or reload, not
   // both.
   const MachineMemOperand *MMO;
-  if (TM.getSubtargetImpl()->getInstrInfo()->isLoadFromStackSlotPostFE(&MI,
-                                                                       FI)) {
+  if (TII->isLoadFromStackSlotPostFE(&MI, FI)) {
     if (FrameInfo->isSpillSlotObjectIndex(FI)) {
       MMO = *MI.memoperands_begin();
       CommentOS << MMO->getSize() << "-byte Reload\n";
     }
-  } else if (TM.getSubtargetImpl()->getInstrInfo()->hasLoadFromStackSlot(
-                 &MI, MMO, FI)) {
+  } else if (TII->hasLoadFromStackSlot(&MI, MMO, FI)) {
     if (FrameInfo->isSpillSlotObjectIndex(FI))
       CommentOS << MMO->getSize() << "-byte Folded Reload\n";
-  } else if (TM.getSubtargetImpl()->getInstrInfo()->isStoreToStackSlotPostFE(
-                 &MI, FI)) {
+  } else if (TII->isStoreToStackSlotPostFE(&MI, FI)) {
     if (FrameInfo->isSpillSlotObjectIndex(FI)) {
       MMO = *MI.memoperands_begin();
       CommentOS << MMO->getSize() << "-byte Spill\n";
     }
-  } else if (TM.getSubtargetImpl()->getInstrInfo()->hasStoreToStackSlot(
-                 &MI, MMO, FI)) {
+  } else if (TII->hasStoreToStackSlot(&MI, MMO, FI)) {
     if (FrameInfo->isSpillSlotObjectIndex(FI))
       CommentOS << MMO->getSize() << "-byte Folded Spill\n";
   }
@@ -589,9 +622,8 @@
 /// that is an implicit def.
 void AsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
   unsigned RegNo = MI->getOperand(0).getReg();
-  OutStreamer.AddComment(
-      Twine("implicit-def: ") +
-      TM.getSubtargetImpl()->getRegisterInfo()->getName(RegNo));
+  OutStreamer.AddComment(Twine("implicit-def: ") +
+                         MMI->getContext().getRegisterInfo()->getName(RegNo));
   OutStreamer.AddBlankLine();
 }
 
@@ -601,7 +633,7 @@
     const MachineOperand &Op = MI->getOperand(i);
     assert(Op.isReg() && "KILL instruction must have only register operands");
     Str += ' ';
-    Str += AP.TM.getSubtargetImpl()->getRegisterInfo()->getName(Op.getReg());
+    Str += AP.MMI->getContext().getRegisterInfo()->getName(Op.getReg());
     Str += (Op.isDef() ? "<def>" : "<kill>");
   }
   AP.OutStreamer.AddComment(Str);
@@ -629,9 +661,9 @@
   OS << V.getName();
 
   DIExpression Expr = MI->getDebugExpression();
-  if (Expr.isVariablePiece())
-    OS << " [piece offset=" << Expr.getPieceOffset()
-       << " size=" << Expr.getPieceSize() << "]";
+  if (Expr.isBitPiece())
+    OS << " [bit_piece offset=" << Expr.getBitPieceOffset()
+       << " size=" << Expr.getBitPieceSize() << "]";
   OS << " <- ";
 
   // The second operand is only an offset if it's an immediate.
@@ -663,8 +695,7 @@
       Reg = MI->getOperand(0).getReg();
     } else {
       assert(MI->getOperand(0).isFI() && "Unknown operand type");
-      const TargetFrameLowering *TFI =
-          AP.TM.getSubtargetImpl()->getFrameLowering();
+      const TargetFrameLowering *TFI = AP.MF->getSubtarget().getFrameLowering();
       Offset += TFI->getFrameIndexReference(*AP.MF,
                                             MI->getOperand(0).getIndex(), Reg);
       Deref = true;
@@ -678,7 +709,7 @@
     }
     if (Deref)
       OS << '[';
-    OS << AP.TM.getSubtargetImpl()->getRegisterInfo()->getName(Reg);
+    OS << AP.MMI->getContext().getRegisterInfo()->getName(Reg);
   }
 
   if (Deref)
@@ -701,8 +732,7 @@
 }
 
 bool AsmPrinter::needsSEHMoves() {
-  return MAI->getExceptionHandlingType() == ExceptionHandling::ItaniumWinEH &&
-         MF->getFunction()->needsUnwindTableEntry();
+  return MAI->usesWindowsCFI() && MF->getFunction()->needsUnwindTableEntry();
 }
 
 void AsmPrinter::emitCFIInstruction(const MachineInstr &MI) {
@@ -721,6 +751,16 @@
   emitCFIInstruction(CFI);
 }
 
+void AsmPrinter::emitFrameAlloc(const MachineInstr &MI) {
+  // The operands are the MCSymbol and the frame offset of the allocation.
+  MCSymbol *FrameAllocSym = MI.getOperand(0).getMCSymbol();
+  int FrameOffset = MI.getOperand(1).getImm();
+
+  // Emit a symbol assignment.
+  OutStreamer.EmitAssignment(FrameAllocSym,
+                             MCConstantExpr::Create(FrameOffset, OutContext));
+}
+
 /// EmitFunctionBody - This method emits the body and trailer for a
 /// function.
 void AsmPrinter::EmitFunctionBody() {
@@ -759,6 +799,10 @@
         emitCFIInstruction(MI);
         break;
 
+      case TargetOpcode::FRAME_ALLOC:
+        emitFrameAlloc(MI);
+        break;
+
       case TargetOpcode::EH_LABEL:
       case TargetOpcode::GC_LABEL:
         OutStreamer.EmitLabel(MI.getOperand(0).getMCSymbol());
@@ -800,7 +844,7 @@
   // labels from collapsing together.  Just emit a noop.
   if ((MAI->hasSubsectionsViaSymbols() && !HasAnyRealCode)) {
     MCInst Noop;
-    TM.getSubtargetImpl()->getInstrInfo()->getNoopForMachoTarget(Noop);
+    MF->getSubtarget().getInstrInfo()->getNoopForMachoTarget(Noop);
     OutStreamer.AddComment("avoids zero-length function");
 
     // Targets can opt-out of emitting the noop here by leaving the opcode
@@ -852,13 +896,95 @@
   OutStreamer.AddBlankLine();
 }
 
-static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP);
+/// \brief Compute the number of Global Variables that use a Constant.
+static unsigned getNumGlobalVariableUses(const Constant *C) {
+  if (!C)
+    return 0;
+
+  if (isa<GlobalVariable>(C))
+    return 1;
+
+  unsigned NumUses = 0;
+  for (auto *CU : C->users())
+    NumUses += getNumGlobalVariableUses(dyn_cast<Constant>(CU));
+
+  return NumUses;
+}
+
+/// \brief Only consider global GOT equivalents if at least one user is a
+/// cstexpr inside an initializer of another global variable. Also, don't
+/// handle cstexpr inside instructions. During global variable emission,
+/// candidates are skipped and are emitted later in case at least one cstexpr
+/// isn't replaced by a PC relative GOT entry access.
+static bool isGOTEquivalentCandidate(const GlobalVariable *GV,
+                                     unsigned &NumGOTEquivUsers) {
+  // Global GOT equivalents are unnamed private globals with a constant
+  // pointer initializer to another global symbol. They must point to a
+  // GlobalVariable or Function, i.e., as GlobalValue.
+  if (!GV->hasUnnamedAddr() || !GV->hasInitializer() || !GV->isConstant() ||
+      !GV->isDiscardableIfUnused() || !dyn_cast<GlobalValue>(GV->getOperand(0)))
+    return false;
+
+  // To be a GOT equivalent, at least one of its users needs to be a constant
+  // expression used by another global variable.
+  for (auto *U : GV->users())
+    NumGOTEquivUsers += getNumGlobalVariableUses(cast<Constant>(U));
+
+  return NumGOTEquivUsers > 0;
+}
+
+/// \brief An unnamed constant global variable that solely contains a pointer
+/// to another global variable is equivalent to a GOT table entry; it contains
+/// the address of another symbol. Optimize it and replace accesses to these
+/// "GOT equivalents" by using the GOT entry for the final global instead.
+/// Compute GOT equivalent candidates among all global variables so that their
+/// emission can be skipped, where possible, once each use is replaced by a GOT
+/// entry access.
+void AsmPrinter::computeGlobalGOTEquivs(Module &M) {
+  if (!getObjFileLowering().supportIndirectSymViaGOTPCRel())
+    return;
+
+  for (const auto &G : M.globals()) {
+    unsigned NumGOTEquivUsers = 0;
+    if (!isGOTEquivalentCandidate(&G, NumGOTEquivUsers))
+      continue;
+
+    const MCSymbol *GOTEquivSym = getSymbol(&G);
+    GlobalGOTEquivs[GOTEquivSym] = std::make_pair(&G, NumGOTEquivUsers);
+  }
+}
+
+/// \brief Constant expressions using GOT equivalent globals may not be eligible
+/// for PC relative GOT entry conversion; in such cases we need to emit the
+/// globals we previously omitted in EmitGlobalVariable.
+void AsmPrinter::emitGlobalGOTEquivs() {
+  if (!getObjFileLowering().supportIndirectSymViaGOTPCRel())
+    return;
+
+  while (!GlobalGOTEquivs.empty()) {
+    DenseMap<const MCSymbol *, GOTEquivUsePair>::iterator I =
+      GlobalGOTEquivs.begin();
+    const MCSymbol *S = I->first;
+    const GlobalVariable *GV = I->second.first;
+    GlobalGOTEquivs.erase(S);
+    EmitGlobalVariable(GV);
+  }
+}
 
 bool AsmPrinter::doFinalization(Module &M) {
+  // Gather all GOT equivalent globals in the module. We really need two
+  // passes over the globals: one to compute the candidates and another that
+  // skips their emission in EmitGlobalVariable; otherwise we would not be able
+  // to handle cases where a GOT equivalent shows up before its use.
+  computeGlobalGOTEquivs(M);
+
   // Emit global variables.
   for (const auto &G : M.globals())
     EmitGlobalVariable(&G);
 
+  // Emit remaining GOT equivalent globals.
+  emitGlobalGOTEquivs();
+
   // Emit visibility info for declarations
   for (const Function &F : M) {
     if (!F.isDeclaration())
@@ -875,10 +1001,15 @@
   JumpInstrTableInfo *JITI = getAnalysisIfAvailable<JumpInstrTableInfo>();
 
   if (JITI && !JITI->getTables().empty()) {
+    // Since we're at the module level we can't use a function specific
+    // MCSubtargetInfo - instead create one with the module defaults.
+    std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo(
+        TM.getTargetTriple(), TM.getTargetCPU(), TM.getTargetFeatureString()));
     unsigned Arch = Triple(getTargetTriple()).getArch();
     bool IsThumb = (Arch == Triple::thumb || Arch == Triple::thumbeb);
+    const TargetInstrInfo *TII = TM.getSubtargetImpl()->getInstrInfo();
     MCInst TrapInst;
-    TM.getSubtargetImpl()->getInstrInfo()->getTrap(TrapInst);
+    TII->getTrap(TrapInst);
     unsigned LogAlignment = llvm::Log2_64(JITI->entryByteAlignment());
 
     // Emit the right section for these functions.
@@ -904,9 +1035,8 @@
         const MCSymbolRefExpr *TargetSymRef =
           MCSymbolRefExpr::Create(TargetSymbol, MCSymbolRefExpr::VK_PLT,
                                   OutContext);
-        TM.getSubtargetImpl()->getInstrInfo()->getUnconditionalBranch(
-            JumpToFun, TargetSymRef);
-        OutStreamer.EmitInstruction(JumpToFun, getSubtargetInfo());
+        TII->getUnconditionalBranch(JumpToFun, TargetSymRef);
+        OutStreamer.EmitInstruction(JumpToFun, *STI);
         ++Count;
       }
 
@@ -914,7 +1044,7 @@
       uint64_t Remaining = NextPowerOf2(Count) - Count;
       for (uint64_t C = 0; C < Remaining; ++C) {
         EmitAlignment(LogAlignment);
-        OutStreamer.EmitInstruction(TrapInst, getSubtargetInfo());
+        OutStreamer.EmitInstruction(TrapInst, *STI);
       }
 
     }
@@ -974,18 +1104,34 @@
     EmitVisibility(Name, Alias.getVisibility());
 
     // Emit the directives as assignments aka .set:
-    OutStreamer.EmitAssignment(Name, lowerConstant(Alias.getAliasee(), *this));
+    OutStreamer.EmitAssignment(Name, lowerConstant(Alias.getAliasee()));
   }
 
   GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
   assert(MI && "AsmPrinter didn't require GCModuleInfo?");
   for (GCModuleInfo::iterator I = MI->end(), E = MI->begin(); I != E; )
     if (GCMetadataPrinter *MP = GetOrCreateGCPrinter(**--I))
-      MP->finishAssembly(*this);
+      MP->finishAssembly(M, *MI, *this);
 
   // Emit llvm.ident metadata in an '.ident' directive.
   EmitModuleIdents(M);
 
+  // Emit __morestack address if needed for indirect calls.
+  if (MMI->usesMorestackAddr()) {
+    const MCSection *ReadOnlySection =
+        getObjFileLowering().getSectionForConstant(SectionKind::getReadOnly(),
+                                                   /*C=*/nullptr);
+    OutStreamer.SwitchSection(ReadOnlySection);
+
+    MCSymbol *AddrSymbol =
+        OutContext.GetOrCreateSymbol(StringRef("__morestack_addr"));
+    OutStreamer.EmitLabel(AddrSymbol);
+
+    unsigned PtrSize = TM.getDataLayout()->getPointerSize(0);
+    OutStreamer.EmitSymbolValue(GetExternalSymbolSymbol("__morestack"),
+                                PtrSize);
+  }
+
   // If we don't have any trampolines, then we don't require stack memory
   // to be executable. Some targets have a directive to declare this.
   Function *InitTrampolineIntrinsic = M.getFunction("llvm.init.trampoline");
@@ -1044,7 +1190,7 @@
     unsigned Align = CPE.getAlignment();
 
     SectionKind Kind =
-        CPE.getSectionKind(TM.getSubtargetImpl()->getDataLayout());
+        CPE.getSectionKind(TM.getDataLayout());
 
     const Constant *C = nullptr;
     if (!CPE.isMachineConstantPoolEntry())
@@ -1098,7 +1244,7 @@
 
       Type *Ty = CPE.getType();
       Offset = NewOffset +
-               TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(Ty);
+               TM.getDataLayout()->getTypeAllocSize(Ty);
 
       OutStreamer.EmitLabel(Sym);
       if (CPE.isMachineConstantPoolEntry())
@@ -1113,7 +1259,7 @@
 /// by the current function to the current output stream.
 ///
 void AsmPrinter::EmitJumpTableInfo() {
-  const DataLayout *DL = MF->getSubtarget().getDataLayout();
+  const DataLayout *DL = MF->getTarget().getDataLayout();
   const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
   if (!MJTI) return;
   if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_Inline) return;
@@ -1123,29 +1269,21 @@
   // Pick the directive to use to print the jump table entries, and switch to
   // the appropriate section.
   const Function *F = MF->getFunction();
-  bool JTInDiffSection = false;
-  if (// In PIC mode, we need to emit the jump table to the same section as the
-      // function body itself, otherwise the label differences won't make sense.
-      // FIXME: Need a better predicate for this: what about custom entries?
-      MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32 ||
-      // We should also do if the section name is NULL or function is declared
-      // in discardable section
-      // FIXME: this isn't the right predicate, should be based on the MCSection
-      // for the function.
-      F->isWeakForLinker()) {
-    OutStreamer.SwitchSection(
-        getObjFileLowering().SectionForGlobal(F, *Mang, TM));
+  const TargetLoweringObjectFile &TLOF = getObjFileLowering();
+  bool JTInDiffSection = !TLOF.shouldPutJumpTableInFunctionSection(
+      MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32,
+      *F);
+  if (!JTInDiffSection) {
+    OutStreamer.SwitchSection(TLOF.SectionForGlobal(F, *Mang, TM));
   } else {
     // Otherwise, drop it in the readonly section.
     const MCSection *ReadOnlySection =
-        getObjFileLowering().getSectionForConstant(SectionKind::getReadOnly(),
-                                                   /*C=*/nullptr);
+        TLOF.getSectionForJumpTable(*F, *Mang, TM);
     OutStreamer.SwitchSection(ReadOnlySection);
-    JTInDiffSection = true;
   }
 
   EmitAlignment(Log2_32(
-      MJTI->getEntryAlignment(*TM.getSubtargetImpl()->getDataLayout())));
+      MJTI->getEntryAlignment(*TM.getDataLayout())));
 
   // Jump tables in code sections are marked with a data_region directive
   // where that's supported.
@@ -1163,7 +1301,7 @@
     if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32 &&
         MAI->doesSetDirectiveSuppressesReloc()) {
       SmallPtrSet<const MachineBasicBlock*, 16> EmittedSets;
-      const TargetLowering *TLI = TM.getSubtargetImpl()->getTargetLowering();
+      const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
       const MCExpr *Base = TLI->getPICJumpTableRelocBaseExpr(MF,JTI,OutContext);
       for (unsigned ii = 0, ee = JTBBs.size(); ii != ee; ++ii) {
         const MachineBasicBlock *MBB = JTBBs[ii];
@@ -1207,9 +1345,8 @@
   case MachineJumpTableInfo::EK_Inline:
     llvm_unreachable("Cannot emit EK_Inline jump table entry");
   case MachineJumpTableInfo::EK_Custom32:
-    Value =
-        TM.getSubtargetImpl()->getTargetLowering()->LowerCustomJumpTableEntry(
-            MJTI, MBB, UID, OutContext);
+    Value = MF->getSubtarget().getTargetLowering()->LowerCustomJumpTableEntry(
+        MJTI, MBB, UID, OutContext);
     break;
   case MachineJumpTableInfo::EK_BlockAddress:
     // EK_BlockAddress - Each entry is a plain address of block, e.g.:
@@ -1248,7 +1385,7 @@
       break;
     }
     Value = MCSymbolRefExpr::Create(MBB->getSymbol(), OutContext);
-    const TargetLowering *TLI = TM.getSubtargetImpl()->getTargetLowering();
+    const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
     const MCExpr *Base = TLI->getPICJumpTableRelocBaseExpr(MF, UID, OutContext);
     Value = MCBinaryExpr::CreateSub(Value, Base, OutContext);
     break;
@@ -1258,7 +1395,7 @@
   assert(Value && "Unknown entry kind!");
 
   unsigned EntrySize =
-      MJTI->getEntrySize(*TM.getSubtargetImpl()->getDataLayout());
+      MJTI->getEntrySize(*TM.getDataLayout());
   OutStreamer.EmitValue(Value, EntrySize);
 }
 
@@ -1368,7 +1505,7 @@
   }
 
   // Emit the function pointers in the target-specific order
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = TM.getDataLayout();
   unsigned Align = Log2_32(DL->getPointerPrefAlignment());
   std::stable_sort(Structors.begin(), Structors.end(),
                    [](const Structor &L,
@@ -1483,25 +1620,26 @@
 //
 void AsmPrinter::EmitAlignment(unsigned NumBits, const GlobalObject *GV) const {
   if (GV)
-    NumBits = getGVAlignmentLog2(GV, *TM.getSubtargetImpl()->getDataLayout(),
+    NumBits = getGVAlignmentLog2(GV, *TM.getDataLayout(),
                                  NumBits);
 
   if (NumBits == 0) return;   // 1-byte aligned: no need to emit alignment.
 
+  assert(NumBits <
+             static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
+         "undefined behavior");
   if (getCurrentSection()->getKind().isText())
-    OutStreamer.EmitCodeAlignment(1 << NumBits);
+    OutStreamer.EmitCodeAlignment(1u << NumBits);
   else
-    OutStreamer.EmitValueToAlignment(1 << NumBits);
+    OutStreamer.EmitValueToAlignment(1u << NumBits);
 }
 
 //===----------------------------------------------------------------------===//
 // Constant emission.
 //===----------------------------------------------------------------------===//
 
-/// lowerConstant - Lower the specified LLVM Constant to an MCExpr.
-///
-static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) {
-  MCContext &Ctx = AP.OutContext;
+const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) {
+  MCContext &Ctx = OutContext;
 
   if (CV->isNullValue() || isa<UndefValue>(CV))
     return MCConstantExpr::Create(0, Ctx);
@@ -1510,19 +1648,18 @@
     return MCConstantExpr::Create(CI->getZExtValue(), Ctx);
 
   if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV))
-    return MCSymbolRefExpr::Create(AP.getSymbol(GV), Ctx);
+    return MCSymbolRefExpr::Create(getSymbol(GV), Ctx);
 
   if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV))
-    return MCSymbolRefExpr::Create(AP.GetBlockAddressSymbol(BA), Ctx);
+    return MCSymbolRefExpr::Create(GetBlockAddressSymbol(BA), Ctx);
 
   const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV);
   if (!CE) {
     llvm_unreachable("Unknown constant value to lower!");
   }
 
-  if (const MCExpr *RelocExpr =
-          AP.getObjFileLowering().getExecutableRelativeSymbol(CE, *AP.Mang,
-                                                              AP.TM))
+  if (const MCExpr *RelocExpr
+      = getObjFileLowering().getExecutableRelativeSymbol(CE, *Mang, TM))
     return RelocExpr;
 
   switch (CE->getOpcode()) {
@@ -1531,9 +1668,9 @@
     // opportunities. Attempt to fold the expression using DataLayout as a
     // last resort before giving up.
     if (Constant *C = ConstantFoldConstantExpression(
-            CE, AP.TM.getSubtargetImpl()->getDataLayout()))
+            CE, TM.getDataLayout()))
       if (C != CE)
-        return lowerConstant(C, AP);
+        return lowerConstant(C);
 
     // Otherwise report the problem to the user.
     {
@@ -1541,16 +1678,17 @@
       raw_string_ostream OS(S);
       OS << "Unsupported expression in static initializer: ";
       CE->printAsOperand(OS, /*PrintType=*/false,
-                     !AP.MF ? nullptr : AP.MF->getFunction()->getParent());
+                     !MF ? nullptr : MF->getFunction()->getParent());
       report_fatal_error(OS.str());
     }
   case Instruction::GetElementPtr: {
-    const DataLayout &DL = *AP.TM.getSubtargetImpl()->getDataLayout();
+    const DataLayout &DL = *TM.getDataLayout();
+
     // Generate a symbolic expression for the byte address
     APInt OffsetAI(DL.getPointerTypeSizeInBits(CE->getType()), 0);
     cast<GEPOperator>(CE)->accumulateConstantOffset(DL, OffsetAI);
 
-    const MCExpr *Base = lowerConstant(CE->getOperand(0), AP);
+    const MCExpr *Base = lowerConstant(CE->getOperand(0));
     if (!OffsetAI)
       return Base;
 
@@ -1566,26 +1704,28 @@
     // is reasonable to treat their delta as a 32-bit value.
     // FALL THROUGH.
   case Instruction::BitCast:
-    return lowerConstant(CE->getOperand(0), AP);
+    return lowerConstant(CE->getOperand(0));
 
   case Instruction::IntToPtr: {
-    const DataLayout &DL = *AP.TM.getSubtargetImpl()->getDataLayout();
+    const DataLayout &DL = *TM.getDataLayout();
+
     // Handle casts to pointers by changing them into casts to the appropriate
     // integer type.  This promotes constant folding and simplifies this code.
     Constant *Op = CE->getOperand(0);
     Op = ConstantExpr::getIntegerCast(Op, DL.getIntPtrType(CV->getType()),
                                       false/*ZExt*/);
-    return lowerConstant(Op, AP);
+    return lowerConstant(Op);
   }
 
   case Instruction::PtrToInt: {
-    const DataLayout &DL = *AP.TM.getSubtargetImpl()->getDataLayout();
+    const DataLayout &DL = *TM.getDataLayout();
+
     // Support only foldable casts to/from pointers that can be eliminated by
     // changing the pointer to the appropriately sized integer type.
     Constant *Op = CE->getOperand(0);
     Type *Ty = CE->getType();
 
-    const MCExpr *OpExpr = lowerConstant(Op, AP);
+    const MCExpr *OpExpr = lowerConstant(Op);
 
     // We can emit the pointer value into this slot if the slot is an
     // integer slot equal to the size of the pointer.
@@ -1611,8 +1751,8 @@
   case Instruction::And:
   case Instruction::Or:
   case Instruction::Xor: {
-    const MCExpr *LHS = lowerConstant(CE->getOperand(0), AP);
-    const MCExpr *RHS = lowerConstant(CE->getOperand(1), AP);
+    const MCExpr *LHS = lowerConstant(CE->getOperand(0));
+    const MCExpr *RHS = lowerConstant(CE->getOperand(1));
     switch (CE->getOpcode()) {
     default: llvm_unreachable("Unknown binary operator constant cast expr");
     case Instruction::Add: return MCBinaryExpr::CreateAdd(LHS, RHS, Ctx);
@@ -1629,7 +1769,9 @@
   }
 }
 
-static void emitGlobalConstantImpl(const Constant *C, AsmPrinter &AP);
+static void emitGlobalConstantImpl(const Constant *C, AsmPrinter &AP,
+                                   const Constant *BaseCV = nullptr,
+                                   uint64_t Offset = 0);
 
 /// isRepeatedByteSequence - Determine whether the given value is
 /// composed of a repeated sequence of identical bytes and return the
@@ -1653,7 +1795,7 @@
     if (CI->getBitWidth() > 64) return -1;
 
     uint64_t Size =
-        TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(V->getType());
+        TM.getDataLayout()->getTypeAllocSize(V->getType());
     uint64_t Value = CI->getZExtValue();
 
     // Make sure the constant is at least 8 bits long and has a power
@@ -1698,7 +1840,7 @@
   int Value = isRepeatedByteSequence(CDS, AP.TM);
   if (Value != -1) {
     uint64_t Bytes =
-        AP.TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(
+        AP.TM.getDataLayout()->getTypeAllocSize(
             CDS->getType());
     // Don't emit a 1-byte object as a .fill.
     if (Bytes > 1)
@@ -1749,7 +1891,7 @@
     }
   }
 
-  const DataLayout &DL = *AP.TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout &DL = *AP.TM.getDataLayout();
   unsigned Size = DL.getTypeAllocSize(CDS->getType());
   unsigned EmittedSize = DL.getTypeAllocSize(CDS->getType()->getElementType()) *
                         CDS->getNumElements();
@@ -1758,20 +1900,22 @@
 
 }
 
-static void emitGlobalConstantArray(const ConstantArray *CA, AsmPrinter &AP) {
+static void emitGlobalConstantArray(const ConstantArray *CA, AsmPrinter &AP,
+                                    const Constant *BaseCV, uint64_t Offset) {
   // See if we can aggregate some values.  Make sure it can be
   // represented as a series of bytes of the constant value.
   int Value = isRepeatedByteSequence(CA, AP.TM);
+  const DataLayout &DL = *AP.TM.getDataLayout();
 
   if (Value != -1) {
-    uint64_t Bytes =
-        AP.TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(
-            CA->getType());
+    uint64_t Bytes = DL.getTypeAllocSize(CA->getType());
     AP.OutStreamer.EmitFill(Bytes, Value);
   }
   else {
-    for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i)
-      emitGlobalConstantImpl(CA->getOperand(i), AP);
+    for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i) {
+      emitGlobalConstantImpl(CA->getOperand(i), AP, BaseCV, Offset);
+      Offset += DL.getTypeAllocSize(CA->getOperand(i)->getType());
+    }
   }
 }
 
@@ -1779,7 +1923,7 @@
   for (unsigned i = 0, e = CV->getType()->getNumElements(); i != e; ++i)
     emitGlobalConstantImpl(CV->getOperand(i), AP);
 
-  const DataLayout &DL = *AP.TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout &DL = *AP.TM.getDataLayout();
   unsigned Size = DL.getTypeAllocSize(CV->getType());
   unsigned EmittedSize = DL.getTypeAllocSize(CV->getType()->getElementType()) *
                          CV->getType()->getNumElements();
@@ -1787,24 +1931,25 @@
     AP.OutStreamer.EmitZeros(Padding);
 }
 
-static void emitGlobalConstantStruct(const ConstantStruct *CS, AsmPrinter &AP) {
+static void emitGlobalConstantStruct(const ConstantStruct *CS, AsmPrinter &AP,
+                                     const Constant *BaseCV, uint64_t Offset) {
   // Print the fields in successive locations. Pad to align if needed!
-  const DataLayout *DL = AP.TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = AP.TM.getDataLayout();
   unsigned Size = DL->getTypeAllocSize(CS->getType());
   const StructLayout *Layout = DL->getStructLayout(CS->getType());
   uint64_t SizeSoFar = 0;
   for (unsigned i = 0, e = CS->getNumOperands(); i != e; ++i) {
     const Constant *Field = CS->getOperand(i);
 
+    // Print the actual field value.
+    emitGlobalConstantImpl(Field, AP, BaseCV, Offset+SizeSoFar);
+
     // Check if padding is needed and insert one or more 0s.
     uint64_t FieldSize = DL->getTypeAllocSize(Field->getType());
     uint64_t PadSize = ((i == e-1 ? Size : Layout->getElementOffset(i+1))
                         - Layout->getElementOffset(i)) - FieldSize;
     SizeSoFar += FieldSize + PadSize;
 
-    // Now print the actual field value.
-    emitGlobalConstantImpl(Field, AP);
-
     // Insert padding - this may include padding to increase the size of the
     // current field up to the ABI size (if the struct is not packed) as well
     // as padding to ensure that the next field starts at the right offset.
@@ -1839,7 +1984,7 @@
 
   // PPC's long double has odd notions of endianness compared to how LLVM
   // handles it: p[0] goes first for *big* endian on PPC.
-  if (AP.TM.getSubtargetImpl()->getDataLayout()->isBigEndian() &&
+  if (AP.TM.getDataLayout()->isBigEndian() &&
       !CFP->getType()->isPPC_FP128Ty()) {
     int Chunk = API.getNumWords() - 1;
 
@@ -1858,13 +2003,13 @@
   }
 
   // Emit the tail padding for the long double.
-  const DataLayout &DL = *AP.TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout &DL = *AP.TM.getDataLayout();
   AP.OutStreamer.EmitZeros(DL.getTypeAllocSize(CFP->getType()) -
                            DL.getTypeStoreSize(CFP->getType()));
 }
 
 static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
-  const DataLayout *DL = AP.TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = AP.TM.getDataLayout();
   unsigned BitWidth = CI->getBitWidth();
 
   // Copy the value as we may massage the layout for constants whose bit width
@@ -1910,7 +2055,7 @@
     // Emit the extra bits after the 64-bits chunks.
 
     // Emit a directive that fills the expected size.
-    uint64_t Size = AP.TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(
+    uint64_t Size = AP.TM.getDataLayout()->getTypeAllocSize(
         CI->getType());
     Size -= (BitWidth / 64) * 8;
     assert(Size && Size * 8 >= ExtraBitsSize &&
@@ -1920,9 +2065,100 @@
   }
 }
 
-static void emitGlobalConstantImpl(const Constant *CV, AsmPrinter &AP) {
-  const DataLayout *DL = AP.TM.getSubtargetImpl()->getDataLayout();
+/// \brief Transform a non-absolute MCExpr containing a reference to a GOT
+/// equivalent global into a target-specific, GOT PC relative access to the
+/// final symbol.
+static void handleIndirectSymViaGOTPCRel(AsmPrinter &AP, const MCExpr **ME,
+                                         const Constant *BaseCst,
+                                         uint64_t Offset) {
+  // The global @foo below illustrates a global that uses a got equivalent.
+  //
+  //  @bar = global i32 42
+  //  @gotequiv = private unnamed_addr constant i32* @bar
+  //  @foo = i32 trunc (i64 sub (i64 ptrtoint (i32** @gotequiv to i64),
+  //                             i64 ptrtoint (i32* @foo to i64))
+  //                        to i32)
+  //
+  // The cstexpr in @foo is converted into the MCExpr `ME`, where we actually
+  // check whether @foo is suitable to use a GOTPCREL. `ME` is usually in the
+  // form:
+  //
+  //  foo = cstexpr, where
+  //    cstexpr := <gotequiv> - "." + <cst>
+  //    cstexpr := <gotequiv> - (<foo> - <offset from @foo base>) + <cst>
+  //
+  // After canonicalization by EvaluateAsRelocatable `ME` turns into:
+  //
+  //  cstexpr := <gotequiv> - <foo> + gotpcrelcst, where
+  //    gotpcrelcst := <offset from @foo base> + <cst>
+  //
+  MCValue MV;
+  if (!(*ME)->EvaluateAsRelocatable(MV, nullptr, nullptr) || MV.isAbsolute())
+    return;
+
+  const MCSymbol *GOTEquivSym = &MV.getSymA()->getSymbol();
+  if (!AP.GlobalGOTEquivs.count(GOTEquivSym))
+    return;
+
+  const GlobalValue *BaseGV = dyn_cast<GlobalValue>(BaseCst);
+  if (!BaseGV)
+    return;
+
+  const MCSymbol *BaseSym = AP.getSymbol(BaseGV);
+  if (BaseSym != &MV.getSymB()->getSymbol())
+    return;
+
+  // Make sure to match:
+  //
+  //    gotpcrelcst := <offset from @foo base> + <cst>
+  //
+  int64_t GOTPCRelCst = Offset + MV.getConstant();
+  if (GOTPCRelCst < 0)
+    return;
+
+  // Emit the GOT PC relative to replace the got equivalent global, i.e.:
+  //
+  //  bar:
+  //    .long 42
+  //  gotequiv:
+  //    .quad bar
+  //  foo:
+  //    .long gotequiv - "." + <cst>
+  //
+  // is replaced by the target specific equivalent to:
+  //
+  //  bar:
+  //    .long 42
+  //  foo:
+  //    .long bar@GOTPCREL+<gotpcrelcst>
+  //
+  AsmPrinter::GOTEquivUsePair Result = AP.GlobalGOTEquivs[GOTEquivSym];
+  const GlobalVariable *GV = Result.first;
+  unsigned NumUses = Result.second;
+  const GlobalValue *FinalGV = dyn_cast<GlobalValue>(GV->getOperand(0));
+  const MCSymbol *FinalSym = AP.getSymbol(FinalGV);
+  *ME = AP.getObjFileLowering().getIndirectSymViaGOTPCRel(FinalSym,
+                                                          GOTPCRelCst);
+
+  // Update GOT equivalent usage information
+  --NumUses;
+  if (NumUses)
+    AP.GlobalGOTEquivs[GOTEquivSym] = std::make_pair(GV, NumUses);
+  else
+    AP.GlobalGOTEquivs.erase(GOTEquivSym);
+}
+
+static void emitGlobalConstantImpl(const Constant *CV, AsmPrinter &AP,
+                                   const Constant *BaseCV, uint64_t Offset) {
+  const DataLayout *DL = AP.TM.getDataLayout();
   uint64_t Size = DL->getTypeAllocSize(CV->getType());
+
+  // Globals with sub-elements such as combinations of arrays and structs
+  // are handled recursively by emitGlobalConstantImpl. Keep track of the
+  // constant symbol base and the current position with BaseCV and Offset.
+  if (!BaseCV && CV->hasOneUse())
+    BaseCV = dyn_cast<Constant>(CV->user_back());
+
   if (isa<ConstantAggregateZero>(CV) || isa<UndefValue>(CV))
     return AP.OutStreamer.EmitZeros(Size);
 
@@ -1955,10 +2191,10 @@
     return emitGlobalConstantDataSequential(CDS, AP);
 
   if (const ConstantArray *CVA = dyn_cast<ConstantArray>(CV))
-    return emitGlobalConstantArray(CVA, AP);
+    return emitGlobalConstantArray(CVA, AP, BaseCV, Offset);
 
   if (const ConstantStruct *CVS = dyn_cast<ConstantStruct>(CV))
-    return emitGlobalConstantStruct(CVS, AP);
+    return emitGlobalConstantStruct(CVS, AP, BaseCV, Offset);
 
   if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV)) {
     // Look through bitcasts, which might not be able to be MCExpr'ized (e.g. of
@@ -1981,13 +2217,21 @@
 
   // Otherwise, it must be a ConstantExpr.  Lower it to an MCExpr, then emit it
   // thread the streamer with EmitValue.
-  AP.OutStreamer.EmitValue(lowerConstant(CV, AP), Size);
+  const MCExpr *ME = AP.lowerConstant(CV);
+
+  // Since lowerConstant already folded and got rid of all IR pointer and
+  // integer casts, detect GOT equivalent accesses by looking into the MCExpr
+  // directly.
+  if (AP.getObjFileLowering().supportIndirectSymViaGOTPCRel())
+    handleIndirectSymViaGOTPCRel(AP, &ME, BaseCV, Offset);
+
+  AP.OutStreamer.EmitValue(ME, Size);
 }
 
 /// EmitGlobalConstant - Print a general LLVM constant to the .s file.
 void AsmPrinter::EmitGlobalConstant(const Constant *CV) {
   uint64_t Size =
-      TM.getSubtargetImpl()->getDataLayout()->getTypeAllocSize(CV->getType());
+      TM.getDataLayout()->getTypeAllocSize(CV->getType());
   if (Size)
     emitGlobalConstantImpl(CV, *this);
   else if (MAI->hasSubsectionsViaSymbols()) {
@@ -2015,16 +2259,16 @@
 
 /// GetTempSymbol - Return the MCSymbol corresponding to the assembler
 /// temporary label with the specified stem and unique ID.
-MCSymbol *AsmPrinter::GetTempSymbol(Twine Name, unsigned ID) const {
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+MCSymbol *AsmPrinter::GetTempSymbol(const Twine &Name, unsigned ID) const {
+  const DataLayout *DL = TM.getDataLayout();
   return OutContext.GetOrCreateSymbol(Twine(DL->getPrivateGlobalPrefix()) +
                                       Name + Twine(ID));
 }
 
 /// GetTempSymbol - Return an assembler temporary label with the specified
 /// stem.
-MCSymbol *AsmPrinter::GetTempSymbol(Twine Name) const {
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+MCSymbol *AsmPrinter::GetTempSymbol(const Twine &Name) const {
+  const DataLayout *DL = TM.getDataLayout();
   return OutContext.GetOrCreateSymbol(Twine(DL->getPrivateGlobalPrefix())+
                                       Name);
 }
@@ -2040,7 +2284,7 @@
 
 /// GetCPISymbol - Return the symbol for the specified constant pool entry.
 MCSymbol *AsmPrinter::GetCPISymbol(unsigned CPID) const {
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = TM.getDataLayout();
   return OutContext.GetOrCreateSymbol
     (Twine(DL->getPrivateGlobalPrefix()) + "CPI" + Twine(getFunctionNumber())
      + "_" + Twine(CPID));
@@ -2054,7 +2298,7 @@
 /// GetJTSetSymbol - Return the symbol for the specified jump table .set
 /// FIXME: privatize to AsmPrinter.
 MCSymbol *AsmPrinter::GetJTSetSymbol(unsigned UID, unsigned MBBID) const {
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = TM.getDataLayout();
   return OutContext.GetOrCreateSymbol
   (Twine(DL->getPrivateGlobalPrefix()) + Twine(getFunctionNumber()) + "_" +
    Twine(UID) + "_set_" + Twine(MBBID));
@@ -2252,6 +2496,11 @@
   if (!S.usesMetadata())
     return nullptr;
 
+  assert(!S.useStatepoints() && "statepoints do not currently support custom"
+         " stackmap formats, please see the documentation for a description of"
+         " the default format.  If you really need a custom serialized format,"
+         " please file a bug");
+
   gcp_map_type &GCMap = getGCMap(GCMetadataPrinters);
   gcp_map_type::iterator GCPI = GCMap.find(&S);
   if (GCPI != GCMap.end())
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
index 05f6a68..d0958c1 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
@@ -12,26 +12,44 @@
 //===----------------------------------------------------------------------===//
 
 #include "ByteStreamer.h"
-#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/ADT/SmallBitVector.h"
+#include "DwarfExpression.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MachineLocation.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "asm-printer"
 
+void DebugLocDwarfExpression::EmitOp(uint8_t Op, const char *Comment) {
+  BS.EmitInt8(
+      Op, Comment ? Twine(Comment) + " " + dwarf::OperationEncodingString(Op)
+                  : dwarf::OperationEncodingString(Op));
+}
+
+void DebugLocDwarfExpression::EmitSigned(int Value) {
+  BS.EmitSLEB128(Value, Twine(Value));
+}
+
+void DebugLocDwarfExpression::EmitUnsigned(unsigned Value) {
+  BS.EmitULEB128(Value, Twine(Value));
+}
+
+bool DebugLocDwarfExpression::isFrameRegister(unsigned MachineReg) {
+  // This information is not available while emitting .debug_loc entries.
+  return false;
+}
+
 //===----------------------------------------------------------------------===//
 // Dwarf Emission Helper Routines
 //===----------------------------------------------------------------------===//
@@ -131,7 +149,7 @@
   default:
     llvm_unreachable("Invalid encoded value.");
   case dwarf::DW_EH_PE_absptr:
-    return TM.getSubtargetImpl()->getDataLayout()->getPointerSize();
+    return TM.getDataLayout()->getPointerSize();
   case dwarf::DW_EH_PE_udata2:
     return 2;
   case dwarf::DW_EH_PE_udata4:
@@ -187,57 +205,6 @@
   EmitLabelDifference(Label, SectionLabel, 4);
 }
 
-/// Emit a dwarf register operation.
-static void emitDwarfRegOp(ByteStreamer &Streamer, int Reg) {
-  assert(Reg >= 0);
-  if (Reg < 32) {
-    Streamer.EmitInt8(dwarf::DW_OP_reg0 + Reg,
-                      dwarf::OperationEncodingString(dwarf::DW_OP_reg0 + Reg));
-  } else {
-    Streamer.EmitInt8(dwarf::DW_OP_regx, "DW_OP_regx");
-    Streamer.EmitULEB128(Reg, Twine(Reg));
-  }
-}
-
-/// Emit an (double-)indirect dwarf register operation.
-static void emitDwarfRegOpIndirect(ByteStreamer &Streamer, int Reg, int Offset,
-                                   bool Deref) {
-  assert(Reg >= 0);
-  if (Reg < 32) {
-    Streamer.EmitInt8(dwarf::DW_OP_breg0 + Reg,
-                      dwarf::OperationEncodingString(dwarf::DW_OP_breg0 + Reg));
-  } else {
-    Streamer.EmitInt8(dwarf::DW_OP_bregx, "DW_OP_bregx");
-    Streamer.EmitULEB128(Reg, Twine(Reg));
-  }
-  Streamer.EmitSLEB128(Offset);
-  if (Deref)
-    Streamer.EmitInt8(dwarf::DW_OP_deref, "DW_OP_deref");
-}
-
-void AsmPrinter::EmitDwarfOpPiece(ByteStreamer &Streamer, unsigned SizeInBits,
-                                  unsigned OffsetInBits) const {
-  assert(SizeInBits > 0 && "piece has size zero");
-  const unsigned SizeOfByte = 8;
-  if (OffsetInBits > 0 || SizeInBits % SizeOfByte) {
-    Streamer.EmitInt8(dwarf::DW_OP_bit_piece, "DW_OP_bit_piece");
-    Streamer.EmitULEB128(SizeInBits, Twine(SizeInBits));
-    Streamer.EmitULEB128(OffsetInBits, Twine(OffsetInBits));
-  } else {
-    Streamer.EmitInt8(dwarf::DW_OP_piece, "DW_OP_piece");
-    unsigned ByteSize = SizeInBits / SizeOfByte;
-    Streamer.EmitULEB128(ByteSize, Twine(ByteSize));
-  }
-}
-
-/// Emit a shift-right dwarf expression.
-static void emitDwarfOpShr(ByteStreamer &Streamer,
-                           unsigned ShiftBy) {
-  Streamer.EmitInt8(dwarf::DW_OP_constu, "DW_OP_constu");
-  Streamer.EmitULEB128(ShiftBy);
-  Streamer.EmitInt8(dwarf::DW_OP_shr, "DW_OP_shr");
-}
-
 // Some targets do not provide a DWARF register number for every
 // register.  This function attempts to emit a DWARF register by
 // emitting a piece of a super-register or by piecing together
@@ -247,112 +214,44 @@
                                      unsigned PieceSizeInBits,
                                      unsigned PieceOffsetInBits) const {
   assert(MLoc.isReg() && "MLoc must be a register");
-  const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
-  int Reg = TRI->getDwarfRegNum(MLoc.getReg(), false);
+  DebugLocDwarfExpression Expr(*this, Streamer);
+  Expr.AddMachineRegPiece(MLoc.getReg(), PieceSizeInBits, PieceOffsetInBits);
+}
 
-  // If this is a valid register number, emit it.
-  if (Reg >= 0) {
-    emitDwarfRegOp(Streamer, Reg);
-    EmitDwarfOpPiece(Streamer, PieceSizeInBits, PieceOffsetInBits);
-    return;
-  }
-
-  // Walk up the super-register chain until we find a valid number.
-  // For example, EAX on x86_64 is a 32-bit piece of RAX with offset 0.
-  for (MCSuperRegIterator SR(MLoc.getReg(), TRI); SR.isValid(); ++SR) {
-    Reg = TRI->getDwarfRegNum(*SR, false);
-    if (Reg >= 0) {
-      unsigned Idx = TRI->getSubRegIndex(*SR, MLoc.getReg());
-      unsigned Size = TRI->getSubRegIdxSize(Idx);
-      unsigned RegOffset = TRI->getSubRegIdxOffset(Idx);
-      OutStreamer.AddComment("super-register");
-      emitDwarfRegOp(Streamer, Reg);
-      if (PieceOffsetInBits == RegOffset) {
-        EmitDwarfOpPiece(Streamer, Size, RegOffset);
-      } else {
-        // If this is part of a variable in a sub-register at a
-        // non-zero offset, we need to manually shift the value into
-        // place, since the DW_OP_piece describes the part of the
-        // variable, not the position of the subregister.
-        if (RegOffset)
-          emitDwarfOpShr(Streamer, RegOffset);
-        EmitDwarfOpPiece(Streamer, Size, PieceOffsetInBits);
-      }
-      return;
-    }
-  }
-
-  // Otherwise, attempt to find a covering set of sub-register numbers.
-  // For example, Q0 on ARM is a composition of D0+D1.
-  //
-  // Keep track of the current position so we can emit the more
-  // efficient DW_OP_piece.
-  unsigned CurPos = PieceOffsetInBits;
-  // The size of the register in bits, assuming 8 bits per byte.
-  unsigned RegSize = TRI->getMinimalPhysRegClass(MLoc.getReg())->getSize() * 8;
-  // Keep track of the bits in the register we already emitted, so we
-  // can avoid emitting redundant aliasing subregs.
-  SmallBitVector Coverage(RegSize, false);
-  for (MCSubRegIterator SR(MLoc.getReg(), TRI); SR.isValid(); ++SR) {
-    unsigned Idx = TRI->getSubRegIndex(MLoc.getReg(), *SR);
-    unsigned Size = TRI->getSubRegIdxSize(Idx);
-    unsigned Offset = TRI->getSubRegIdxOffset(Idx);
-    Reg = TRI->getDwarfRegNum(*SR, false);
-
-    // Intersection between the bits we already emitted and the bits
-    // covered by this subregister.
-    SmallBitVector Intersection(RegSize, false);
-    Intersection.set(Offset, Offset + Size);
-    Intersection ^= Coverage;
-
-    // If this sub-register has a DWARF number and we haven't covered
-    // its range, emit a DWARF piece for it.
-    if (Reg >= 0 && Intersection.any()) {
-      OutStreamer.AddComment("sub-register");
-      emitDwarfRegOp(Streamer, Reg);
-      EmitDwarfOpPiece(Streamer, Size, Offset == CurPos ? 0 : Offset);
-      CurPos = Offset + Size;
-
-      // Mark it as emitted.
-      Coverage.set(Offset, Offset + Size);
-    }
-  }
-
-  if (CurPos == PieceOffsetInBits) {
-    // FIXME: We have no reasonable way of handling errors in here.
-    Streamer.EmitInt8(dwarf::DW_OP_nop,
-                      "nop (could not find a dwarf register number)");
-  }
+void AsmPrinter::EmitDwarfOpPiece(ByteStreamer &Streamer,
+                                  unsigned PieceSizeInBits,
+                                  unsigned PieceOffsetInBits) const {
+  DebugLocDwarfExpression Expr(*this, Streamer);
+  Expr.AddOpPiece(PieceSizeInBits, PieceOffsetInBits);
 }
 
 /// EmitDwarfRegOp - Emit dwarf register operation.
 void AsmPrinter::EmitDwarfRegOp(ByteStreamer &Streamer,
-                                const MachineLocation &MLoc,
-                                bool Indirect) const {
-  const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
-  int Reg = TRI->getDwarfRegNum(MLoc.getReg(), false);
+                                const MachineLocation &MLoc) const {
+  DebugLocDwarfExpression Expr(*this, Streamer);
+  const MCRegisterInfo *MRI = MMI->getContext().getRegisterInfo();
+  int Reg = MRI->getDwarfRegNum(MLoc.getReg(), false);
   if (Reg < 0) {
     // We assume that pointers are always in an addressable register.
-    if (Indirect || MLoc.isIndirect()) {
+    if (MLoc.isIndirect())
       // FIXME: We have no reasonable way of handling errors in here. The
       // caller might be in the middle of a dwarf expression. We should
       // probably assert that Reg >= 0 once debug info generation is more
       // mature.
-      Streamer.EmitInt8(dwarf::DW_OP_nop,
-                        "nop (invalid dwarf register number for indirect loc)");
-      return;
-    }
+      return Expr.EmitOp(dwarf::DW_OP_nop,
+                         "nop (could not find a dwarf register number)");
 
     // Attempt to find a valid super- or sub-register.
-    return EmitDwarfRegOpPiece(Streamer, MLoc);
+    if (!Expr.AddMachineRegPiece(MLoc.getReg()))
+      Expr.EmitOp(dwarf::DW_OP_nop,
+                  "nop (could not find a dwarf register number)");
+    return;
   }
 
   if (MLoc.isIndirect())
-    emitDwarfRegOpIndirect(Streamer, Reg, MLoc.getOffset(), Indirect);
-  else if (Indirect)
-    emitDwarfRegOpIndirect(Streamer, Reg, 0, false);
+    Expr.AddRegIndirect(Reg, MLoc.getOffset());
   else
-    emitDwarfRegOp(Streamer, Reg);
+    Expr.AddReg(Reg);
 }
 
 //===----------------------------------------------------------------------===//
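
For reference, the AddReg / AddRegIndirect calls that EmitDwarfRegOp now forwards to boil down to a handful of DWARF operators. The following is a minimal standalone sketch, not the DwarfExpression implementation this patch adds; the opcode values (DW_OP_reg0 = 0x50, DW_OP_regx = 0x90, DW_OP_breg0 = 0x70, DW_OP_bregx = 0x92) are taken from the DWARF 4 specification.

    // Illustrative sketch only: how a DWARF register location is encoded at
    // the byte level. Not the DwarfExpression class introduced in this patch.
    #include <cstdint>
    #include <vector>

    static void emitULEB128(std::vector<uint8_t> &Out, uint64_t V) {
      do {
        uint8_t Byte = V & 0x7f;
        V >>= 7;
        if (V)
          Byte |= 0x80; // more bytes follow
        Out.push_back(Byte);
      } while (V);
    }

    static void emitSLEB128(std::vector<uint8_t> &Out, int64_t V) {
      bool More;
      do {
        uint8_t Byte = V & 0x7f;
        V >>= 7;
        More = !((V == 0 && !(Byte & 0x40)) || (V == -1 && (Byte & 0x40)));
        if (More)
          Byte |= 0x80;
        Out.push_back(Byte);
      } while (More);
    }

    // AddReg: the register itself holds the value.
    static void addReg(std::vector<uint8_t> &Out, unsigned DwarfReg) {
      if (DwarfReg < 32) {
        Out.push_back(0x50 + DwarfReg);   // DW_OP_reg0 .. DW_OP_reg31
      } else {
        Out.push_back(0x90);              // DW_OP_regx
        emitULEB128(Out, DwarfReg);
      }
    }

    // AddRegIndirect: the value lives in memory at register + offset.
    static void addRegIndirect(std::vector<uint8_t> &Out, unsigned DwarfReg,
                               int64_t Offset) {
      if (DwarfReg < 32) {
        Out.push_back(0x70 + DwarfReg);   // DW_OP_breg0 .. DW_OP_breg31
      } else {
        Out.push_back(0x92);              // DW_OP_bregx
        emitULEB128(Out, DwarfReg);
      }
      emitSLEB128(Out, Offset);
    }

As an example, a value spilled to [rbp - 8] on x86-64 (DWARF register 6) would come out as DW_OP_breg6 with SLEB128 offset -8, i.e. the two bytes 0x76 0x78.
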
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index cca5f22..e6e7c97 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -32,6 +32,7 @@
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
@@ -64,7 +65,7 @@
 
     if (LocInfo->getNumOperands() != 0)
       if (const ConstantInt *CI =
-          dyn_cast<ConstantInt>(LocInfo->getOperand(ErrorLine)))
+              mdconst::dyn_extract<ConstantInt>(LocInfo->getOperand(ErrorLine)))
         LocCookie = CI->getZExtValue();
   }
 
@@ -90,8 +91,19 @@
   assert(MCAI && "No MCAsmInfo");
   if (!MCAI->useIntegratedAssembler() &&
       !OutStreamer.isIntegratedAssemblerRequired()) {
+    emitInlineAsmStart();
     OutStreamer.EmitRawText(Str);
-    emitInlineAsmEnd(TM.getSubtarget<MCSubtargetInfo>(), nullptr);
+    // If we have a machine function then grab the MCSubtarget off of that,
+    // otherwise we're at the module level and want to construct one from
+    // the default CPU and target triple.
+    if (MF) {
+      emitInlineAsmEnd(MF->getSubtarget<MCSubtargetInfo>(), nullptr);
+    } else {
+      std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo(
+          TM.getTargetTriple(), TM.getTargetCPU(),
+          TM.getTargetFeatureString()));
+      emitInlineAsmEnd(*STI, nullptr);
+    }
     return;
   }
 
@@ -137,11 +149,13 @@
   // emitInlineAsmEnd().
   MCSubtargetInfo STIOrig = *STI;
 
-  MCTargetOptions MCOptions;
-  if (MF)
-    MCOptions = MF->getTarget().Options.MCOptions;
-  std::unique_ptr<MCTargetAsmParser> TAP(
-      TM.getTarget().createMCAsmParser(*STI, *Parser, *MII, MCOptions));
+  // We create a new MCInstrInfo here since we might be at the module level
+  // and not have a MachineFunction to initialize the TargetInstrInfo from,
+  // and we only need MCInstrInfo for asm parsing. We create one
+  // unconditionally because it's not subtarget dependent.
+  std::unique_ptr<MCInstrInfo> MII(TM.getTarget().createMCInstrInfo());
+  std::unique_ptr<MCTargetAsmParser> TAP(TM.getTarget().createMCAsmParser(
+      *STI, *Parser, *MII, TM.Options.MCOptions));
   if (!TAP)
     report_fatal_error("Inline asm not supported by this streamer because"
                        " we don't have an asm parser for this target\n");
@@ -152,6 +166,7 @@
     TAP->SetFrameRegister(TRI->getFrameRegister(*MF));
   }
 
+  emitInlineAsmStart();
   // Don't implicitly switch to the text section before the asm.
   int Res = Parser->Run(/*NoInitialTextSection*/ true,
                         /*NoFinalize*/ true);
@@ -467,7 +482,8 @@
     if (MI->getOperand(i-1).isMetadata() &&
         (LocMD = MI->getOperand(i-1).getMetadata()) &&
         LocMD->getNumOperands() != 0) {
-      if (const ConstantInt *CI = dyn_cast<ConstantInt>(LocMD->getOperand(0))) {
+      if (const ConstantInt *CI =
+              mdconst::dyn_extract<ConstantInt>(LocMD->getOperand(0))) {
         LocCookie = CI->getZExtValue();
         break;
       }
@@ -505,7 +521,7 @@
 /// for their own strange codes.
 void AsmPrinter::PrintSpecial(const MachineInstr *MI, raw_ostream &OS,
                               const char *Code) const {
-  const DataLayout *DL = TM.getSubtargetImpl()->getDataLayout();
+  const DataLayout *DL = TM.getDataLayout();
   if (!strcmp(Code, "private")) {
     OS << DL->getPrivateGlobalPrefix();
   } else if (!strcmp(Code, "comment")) {
@@ -566,5 +582,7 @@
   return true;
 }
 
+void AsmPrinter::emitInlineAsmStart() const {}
+
 void AsmPrinter::emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
                                   const MCSubtargetInfo *EndInfo) const {}
diff --git a/lib/CodeGen/AsmPrinter/ByteStreamer.h b/lib/CodeGen/AsmPrinter/ByteStreamer.h
index 0cc8353..42be114 100644
--- a/lib/CodeGen/AsmPrinter/ByteStreamer.h
+++ b/lib/CodeGen/AsmPrinter/ByteStreamer.h
@@ -15,10 +15,10 @@
 #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_BYTESTREAMER_H
 #define LLVM_LIB_CODEGEN_ASMPRINTER_BYTESTREAMER_H
 
+#include "DIEHash.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/MC/MCStreamer.h"
-#include "DIEHash.h"
 
 namespace llvm {
 class ByteStreamer {
diff --git a/lib/CodeGen/AsmPrinter/CMakeLists.txt b/lib/CodeGen/AsmPrinter/CMakeLists.txt
index e6b7d64..01d2c72 100644
--- a/lib/CodeGen/AsmPrinter/CMakeLists.txt
+++ b/lib/CodeGen/AsmPrinter/CMakeLists.txt
@@ -11,6 +11,7 @@
   DwarfCFIException.cpp
   DwarfCompileUnit.cpp
   DwarfDebug.cpp
+  DwarfExpression.cpp
   DwarfFile.cpp
   DwarfStringPool.cpp
   DwarfUnit.cpp
diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp
index 50ea369..64ba56b 100644
--- a/lib/CodeGen/AsmPrinter/DIE.cpp
+++ b/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -11,8 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "DIE.h"
-
+#include "llvm/CodeGen/DIE.h"
 #include "DwarfCompileUnit.h"
 #include "DwarfDebug.h"
 #include "DwarfUnit.h"
diff --git a/lib/CodeGen/AsmPrinter/DIE.h b/lib/CodeGen/AsmPrinter/DIE.h
deleted file mode 100644
index e310aef..0000000
--- a/lib/CodeGen/AsmPrinter/DIE.h
+++ /dev/null
@@ -1,587 +0,0 @@
-//===--- lib/CodeGen/DIE.h - DWARF Info Entries -----------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Data structures for DWARF info entries.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DIE_H
-#define LLVM_LIB_CODEGEN_ASMPRINTER_DIE_H
-
-#include "llvm/ADT/FoldingSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/Dwarf.h"
-#include <vector>
-
-namespace llvm {
-class AsmPrinter;
-class MCExpr;
-class MCSymbol;
-class raw_ostream;
-class DwarfTypeUnit;
-
-//===--------------------------------------------------------------------===//
-/// DIEAbbrevData - Dwarf abbreviation data, describes one attribute of a
-/// Dwarf abbreviation.
-class DIEAbbrevData {
-  /// Attribute - Dwarf attribute code.
-  ///
-  dwarf::Attribute Attribute;
-
-  /// Form - Dwarf form code.
-  ///
-  dwarf::Form Form;
-
-public:
-  DIEAbbrevData(dwarf::Attribute A, dwarf::Form F) : Attribute(A), Form(F) {}
-
-  // Accessors.
-  dwarf::Attribute getAttribute() const { return Attribute; }
-  dwarf::Form getForm() const { return Form; }
-
-  /// Profile - Used to gather unique data for the abbreviation folding set.
-  ///
-  void Profile(FoldingSetNodeID &ID) const;
-};
-
-//===--------------------------------------------------------------------===//
-/// DIEAbbrev - Dwarf abbreviation, describes the organization of a debug
-/// information object.
-class DIEAbbrev : public FoldingSetNode {
-  /// Unique number for node.
-  ///
-  unsigned Number;
-
-  /// Tag - Dwarf tag code.
-  ///
-  dwarf::Tag Tag;
-
-  /// Children - Whether or not this node has children.
-  ///
-  // This cheats a bit in all of the uses since the values in the standard
-  // are 0 and 1 for no children and children respectively.
-  bool Children;
-
-  /// Data - Raw data bytes for abbreviation.
-  ///
-  SmallVector<DIEAbbrevData, 12> Data;
-
-public:
-  DIEAbbrev(dwarf::Tag T, bool C) : Tag(T), Children(C), Data() {}
-
-  // Accessors.
-  dwarf::Tag getTag() const { return Tag; }
-  unsigned getNumber() const { return Number; }
-  bool hasChildren() const { return Children; }
-  const SmallVectorImpl<DIEAbbrevData> &getData() const { return Data; }
-  void setChildrenFlag(bool hasChild) { Children = hasChild; }
-  void setNumber(unsigned N) { Number = N; }
-
-  /// AddAttribute - Adds another set of attribute information to the
-  /// abbreviation.
-  void AddAttribute(dwarf::Attribute Attribute, dwarf::Form Form) {
-    Data.push_back(DIEAbbrevData(Attribute, Form));
-  }
-
-  /// Profile - Used to gather unique data for the abbreviation folding set.
-  ///
-  void Profile(FoldingSetNodeID &ID) const;
-
-  /// Emit - Print the abbreviation using the specified asm printer.
-  ///
-  void Emit(AsmPrinter *AP) const;
-
-#ifndef NDEBUG
-  void print(raw_ostream &O);
-  void dump();
-#endif
-};
-
-//===--------------------------------------------------------------------===//
-/// DIE - A structured debug information entry.  Has an abbreviation which
-/// describes its organization.
-class DIEValue;
-
-class DIE {
-protected:
-  /// Offset - Offset in debug info section.
-  ///
-  unsigned Offset;
-
-  /// Size - Size of instance + children.
-  ///
-  unsigned Size;
-
-  /// Abbrev - Buffer for constructing abbreviation.
-  ///
-  DIEAbbrev Abbrev;
-
-  /// Children DIEs.
-  ///
-  // This can't be a vector<DIE> because pointer validity is required for the
-  // Parent pointer and DIEEntry.
-  // It can't be a list<DIE> because some clients need pointer validity before
-  // the object has been added to any child list
-  // (eg: DwarfUnit::constructVariableDIE). These aren't insurmountable, but may
-  // be more convoluted than beneficial.
-  std::vector<std::unique_ptr<DIE>> Children;
-
-  DIE *Parent;
-
-  /// Attribute values.
-  ///
-  SmallVector<DIEValue *, 12> Values;
-
-protected:
-  DIE()
-      : Offset(0), Size(0), Abbrev((dwarf::Tag)0, dwarf::DW_CHILDREN_no),
-        Parent(nullptr) {}
-
-public:
-  explicit DIE(dwarf::Tag Tag)
-      : Offset(0), Size(0), Abbrev((dwarf::Tag)Tag, dwarf::DW_CHILDREN_no),
-        Parent(nullptr) {}
-
-  // Accessors.
-  DIEAbbrev &getAbbrev() { return Abbrev; }
-  const DIEAbbrev &getAbbrev() const { return Abbrev; }
-  unsigned getAbbrevNumber() const { return Abbrev.getNumber(); }
-  dwarf::Tag getTag() const { return Abbrev.getTag(); }
-  unsigned getOffset() const { return Offset; }
-  unsigned getSize() const { return Size; }
-  const std::vector<std::unique_ptr<DIE>> &getChildren() const {
-    return Children;
-  }
-  const SmallVectorImpl<DIEValue *> &getValues() const { return Values; }
-  DIE *getParent() const { return Parent; }
-  /// Climb up the parent chain to get the compile or type unit DIE this DIE
-  /// belongs to.
-  const DIE *getUnit() const;
-  /// Similar to getUnit, returns null when DIE is not added to an
-  /// owner yet.
-  const DIE *getUnitOrNull() const;
-  void setOffset(unsigned O) { Offset = O; }
-  void setSize(unsigned S) { Size = S; }
-
-  /// addValue - Add a value and attributes to a DIE.
-  ///
-  void addValue(dwarf::Attribute Attribute, dwarf::Form Form, DIEValue *Value) {
-    Abbrev.AddAttribute(Attribute, Form);
-    Values.push_back(Value);
-  }
-
-  /// addChild - Add a child to the DIE.
-  ///
-  void addChild(std::unique_ptr<DIE> Child) {
-    assert(!Child->getParent());
-    Abbrev.setChildrenFlag(dwarf::DW_CHILDREN_yes);
-    Child->Parent = this;
-    Children.push_back(std::move(Child));
-  }
-
-  /// findAttribute - Find a value in the DIE with the attribute given,
-  /// returns NULL if no such attribute exists.
-  DIEValue *findAttribute(dwarf::Attribute Attribute) const;
-
-#ifndef NDEBUG
-  void print(raw_ostream &O, unsigned IndentCount = 0) const;
-  void dump();
-#endif
-};
-
-//===--------------------------------------------------------------------===//
-/// DIEValue - A debug information entry value. Some of these roughly correlate
-/// to DWARF attribute classes.
-///
-class DIEValue {
-  virtual void anchor();
-
-public:
-  enum Type {
-    isInteger,
-    isString,
-    isExpr,
-    isLabel,
-    isDelta,
-    isEntry,
-    isTypeSignature,
-    isBlock,
-    isLoc,
-    isLocList,
-  };
-
-protected:
-  /// Ty - Type of data stored in the value.
-  ///
-  Type Ty;
-
-  explicit DIEValue(Type T) : Ty(T) {}
-  virtual ~DIEValue() {}
-
-public:
-  // Accessors
-  Type getType() const { return Ty; }
-
-  /// EmitValue - Emit value via the Dwarf writer.
-  ///
-  virtual void EmitValue(AsmPrinter *AP, dwarf::Form Form) const = 0;
-
-  /// SizeOf - Return the size of a value in bytes.
-  ///
-  virtual unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const = 0;
-
-#ifndef NDEBUG
-  virtual void print(raw_ostream &O) const = 0;
-  void dump() const;
-#endif
-};
-
-//===--------------------------------------------------------------------===//
-/// DIEInteger - An integer value DIE.
-///
-class DIEInteger : public DIEValue {
-  uint64_t Integer;
-
-public:
-  explicit DIEInteger(uint64_t I) : DIEValue(isInteger), Integer(I) {}
-
-  /// BestForm - Choose the best form for integer.
-  ///
-  static dwarf::Form BestForm(bool IsSigned, uint64_t Int) {
-    if (IsSigned) {
-      const int64_t SignedInt = Int;
-      if ((char)Int == SignedInt)
-        return dwarf::DW_FORM_data1;
-      if ((short)Int == SignedInt)
-        return dwarf::DW_FORM_data2;
-      if ((int)Int == SignedInt)
-        return dwarf::DW_FORM_data4;
-    } else {
-      if ((unsigned char)Int == Int)
-        return dwarf::DW_FORM_data1;
-      if ((unsigned short)Int == Int)
-        return dwarf::DW_FORM_data2;
-      if ((unsigned int)Int == Int)
-        return dwarf::DW_FORM_data4;
-    }
-    return dwarf::DW_FORM_data8;
-  }
-
-  /// EmitValue - Emit integer of appropriate size.
-  ///
-  void EmitValue(AsmPrinter *AP, dwarf::Form Form) const override;
-
-  uint64_t getValue() const { return Integer; }
-
-  /// SizeOf - Determine size of integer value in bytes.
-  ///
-  unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const override;
-
-  // Implement isa/cast/dyncast.
-  static bool classof(const DIEValue *I) { return I->getType() == isInteger; }
-
-#ifndef NDEBUG
-  void print(raw_ostream &O) const override;
-#endif
-};
-
-//===--------------------------------------------------------------------===//
-/// DIEExpr - An expression DIE.
-//
-class DIEExpr : public DIEValue {
-  const MCExpr *Expr;
-
-public:
-  explicit DIEExpr(const MCExpr *E) : DIEValue(isExpr), Expr(E) {}
-
-  /// EmitValue - Emit expression value.
-  ///
-  void EmitValue(AsmPrinter *AP, dwarf::Form Form) const override;
-
-  /// getValue - Get MCExpr.
-  ///
-  const MCExpr *getValue() const { return Expr; }
-
-  /// SizeOf - Determine size of expression value in bytes.
-  ///
-  unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const override;
-
-  // Implement isa/cast/dyncast.
-  static bool classof(const DIEValue *E) { return E->getType() == isExpr; }
-
-#ifndef NDEBUG
-  void print(raw_ostream &O) const override;
-#endif
-};
-
-//===--------------------------------------------------------------------===//
-/// DIELabel - A label DIE.
-//
-class DIELabel : public DIEValue {
-  const MCSymbol *Label;
-
-public:
-  explicit DIELabel(const MCSymbol *L) : DIEValue(isLabel), Label(L) {}
-
-  /// EmitValue - Emit label value.
-  ///
-  void EmitValue(AsmPrinter *AP, dwarf::Form Form) const override;
-
-  /// getValue - Get MCSymbol.
-  ///
-  const MCSymbol *getValue() const { return Label; }
-
-  /// SizeOf - Determine size of label value in bytes.
-  ///
-  unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const override;
-
-  // Implement isa/cast/dyncast.
-  static bool classof(const DIEValue *L) { return L->getType() == isLabel; }
-
-#ifndef NDEBUG
-  void print(raw_ostream &O) const override;
-#endif
-};
-
-//===--------------------------------------------------------------------===//
-/// DIEDelta - A simple label difference DIE.
-///
-class DIEDelta : public DIEValue {
-  const MCSymbol *LabelHi;
-  const MCSymbol *LabelLo;
-
-public:
-  DIEDelta(const MCSymbol *Hi, const MCSymbol *Lo)
-      : DIEValue(isDelta), LabelHi(Hi), LabelLo(Lo) {}
-
-  /// EmitValue - Emit delta value.
-  ///
-  void EmitValue(AsmPrinter *AP, dwarf::Form Form) const override;
-
-  /// SizeOf - Determine size of delta value in bytes.
-  ///
-  unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const override;
-
-  // Implement isa/cast/dyncast.
-  static bool classof(const DIEValue *D) { return D->getType() == isDelta; }
-
-#ifndef NDEBUG
-  void print(raw_ostream &O) const override;
-#endif
-};
-
-//===--------------------------------------------------------------------===//
-/// DIEString - A container for string values.
-///
-class DIEString : public DIEValue {
-  const DIEValue *Access;
-  StringRef Str;
-
-public:
-  DIEString(const DIEValue *Acc, StringRef S)
-      : DIEValue(isString), Access(Acc), Str(S) {}
-
-  /// getString - Grab the string out of the object.
-  StringRef getString() const { return Str; }
-
-  /// EmitValue - Emit string value.
-  ///
-  void EmitValue(AsmPrinter *AP, dwarf::Form Form) const override;
-
-  /// SizeOf - Determine size of string value in bytes.
-  ///
-  unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const override;
-
-  // Implement isa/cast/dyncast.
-  static bool classof(const DIEValue *D) { return D->getType() == isString; }
-
-#ifndef NDEBUG
-  void print(raw_ostream &O) const override;
-#endif
-};
-
-//===--------------------------------------------------------------------===//
-/// DIEEntry - A pointer to another debug information entry.  An instance of
-/// this class can also be used as a proxy for a debug information entry not
-/// yet defined (ie. types.)
-class DIEEntry : public DIEValue {
-  DIE &Entry;
-
-public:
-  explicit DIEEntry(DIE &E) : DIEValue(isEntry), Entry(E) {
-  }
-
-  DIE &getEntry() const { return Entry; }
-
-  /// EmitValue - Emit debug information entry offset.
-  ///
-  void EmitValue(AsmPrinter *AP, dwarf::Form Form) const override;
-
-  /// SizeOf - Determine size of debug information entry in bytes.
-  ///
-  unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const override {
-    return Form == dwarf::DW_FORM_ref_addr ? getRefAddrSize(AP)
-                                           : sizeof(int32_t);
-  }
-
-  /// Returns size of a ref_addr entry.
-  static unsigned getRefAddrSize(AsmPrinter *AP);
-
-  // Implement isa/cast/dyncast.
-  static bool classof(const DIEValue *E) { return E->getType() == isEntry; }
-
-#ifndef NDEBUG
-  void print(raw_ostream &O) const override;
-#endif
-};
-
-//===--------------------------------------------------------------------===//
-/// \brief A signature reference to a type unit.
-class DIETypeSignature : public DIEValue {
-  const DwarfTypeUnit &Unit;
-
-public:
-  explicit DIETypeSignature(const DwarfTypeUnit &Unit)
-      : DIEValue(isTypeSignature), Unit(Unit) {}
-
-  /// \brief Emit type unit signature.
-  void EmitValue(AsmPrinter *Asm, dwarf::Form Form) const override;
-
-  /// Returns size of a ref_sig8 entry.
-  unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const override {
-    assert(Form == dwarf::DW_FORM_ref_sig8);
-    return 8;
-  }
-
-  // \brief Implement isa/cast/dyncast.
-  static bool classof(const DIEValue *E) {
-    return E->getType() == isTypeSignature;
-  }
-#ifndef NDEBUG
-  void print(raw_ostream &O) const override;
-  void dump() const;
-#endif
-};
-
-//===--------------------------------------------------------------------===//
-/// DIELoc - Represents an expression location.
-//
-class DIELoc : public DIEValue, public DIE {
-  mutable unsigned Size; // Size in bytes excluding size header.
-public:
-  DIELoc() : DIEValue(isLoc), Size(0) {}
-
-  /// ComputeSize - Calculate the size of the location expression.
-  ///
-  unsigned ComputeSize(AsmPrinter *AP) const;
-
-  /// BestForm - Choose the best form for data.
-  ///
-  dwarf::Form BestForm(unsigned DwarfVersion) const {
-    if (DwarfVersion > 3)
-      return dwarf::DW_FORM_exprloc;
-    // Pre-DWARF4 location expressions were blocks and not exprloc.
-    if ((unsigned char)Size == Size)
-      return dwarf::DW_FORM_block1;
-    if ((unsigned short)Size == Size)
-      return dwarf::DW_FORM_block2;
-    if ((unsigned int)Size == Size)
-      return dwarf::DW_FORM_block4;
-    return dwarf::DW_FORM_block;
-  }
-
-  /// EmitValue - Emit location data.
-  ///
-  void EmitValue(AsmPrinter *AP, dwarf::Form Form) const override;
-
-  /// SizeOf - Determine size of location data in bytes.
-  ///
-  unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const override;
-
-  // Implement isa/cast/dyncast.
-  static bool classof(const DIEValue *E) { return E->getType() == isLoc; }
-
-#ifndef NDEBUG
-  void print(raw_ostream &O) const override;
-#endif
-};
-
-//===--------------------------------------------------------------------===//
-/// DIEBlock - Represents a block of values.
-//
-class DIEBlock : public DIEValue, public DIE {
-  mutable unsigned Size; // Size in bytes excluding size header.
-public:
-  DIEBlock() : DIEValue(isBlock), Size(0) {}
-
-  /// ComputeSize - Calculate the size of the location expression.
-  ///
-  unsigned ComputeSize(AsmPrinter *AP) const;
-
-  /// BestForm - Choose the best form for data.
-  ///
-  dwarf::Form BestForm() const {
-    if ((unsigned char)Size == Size)
-      return dwarf::DW_FORM_block1;
-    if ((unsigned short)Size == Size)
-      return dwarf::DW_FORM_block2;
-    if ((unsigned int)Size == Size)
-      return dwarf::DW_FORM_block4;
-    return dwarf::DW_FORM_block;
-  }
-
-  /// EmitValue - Emit location data.
-  ///
-  void EmitValue(AsmPrinter *AP, dwarf::Form Form) const override;
-
-  /// SizeOf - Determine size of location data in bytes.
-  ///
-  unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const override;
-
-  // Implement isa/cast/dyncast.
-  static bool classof(const DIEValue *E) { return E->getType() == isBlock; }
-
-#ifndef NDEBUG
-  void print(raw_ostream &O) const override;
-#endif
-};
-
-//===--------------------------------------------------------------------===//
-/// DIELocList - Represents a pointer to a location list in the debug_loc
-/// section.
-//
-class DIELocList : public DIEValue {
-  // Index into the .debug_loc vector.
-  size_t Index;
-
-public:
-  DIELocList(size_t I) : DIEValue(isLocList), Index(I) {}
-
-  /// getValue - Grab the current index out.
-  size_t getValue() const { return Index; }
-
-  /// EmitValue - Emit location data.
-  ///
-  void EmitValue(AsmPrinter *AP, dwarf::Form Form) const override;
-
-  /// SizeOf - Determine size of location data in bytes.
-  ///
-  unsigned SizeOf(AsmPrinter *AP, dwarf::Form Form) const override;
-
-  // Implement isa/cast/dyncast.
-  static bool classof(const DIEValue *E) { return E->getType() == isLocList; }
-
-#ifndef NDEBUG
-  void print(raw_ostream &O) const override;
-#endif
-};
-
-} // end llvm namespace
-
-#endif
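
Worked cases for the two BestForm helpers above, assuming the usual 8/16/32-bit widths for char/short/int and a signed plain char:

    DIEInteger::BestForm(false, 200)         -> DW_FORM_data1  (fits in 8 bits)
    DIEInteger::BestForm(false, 70000)       -> DW_FORM_data4  (does not fit in 16 bits)
    DIEInteger::BestForm(false, 1ULL << 40)  -> DW_FORM_data8
    DIEInteger::BestForm(true,  200)         -> DW_FORM_data2  ((char)200 == -56, so data1 is rejected)

DIELoc::BestForm additionally prefers DW_FORM_exprloc whenever the unit's DWARF version is 4 or later, regardless of the expression's size.
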
diff --git a/lib/CodeGen/AsmPrinter/DIEHash.cpp b/lib/CodeGen/AsmPrinter/DIEHash.cpp
index b2a3ba8..1e2ba2c 100644
--- a/lib/CodeGen/AsmPrinter/DIEHash.cpp
+++ b/lib/CodeGen/AsmPrinter/DIEHash.cpp
@@ -13,11 +13,11 @@
 
 #include "ByteStreamer.h"
 #include "DIEHash.h"
-#include "DIE.h"
 #include "DwarfDebug.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DIE.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Support/Endian.h"
diff --git a/lib/CodeGen/AsmPrinter/DIEHash.h b/lib/CodeGen/AsmPrinter/DIEHash.h
index 872aa0e..ac014b7 100644
--- a/lib/CodeGen/AsmPrinter/DIEHash.h
+++ b/lib/CodeGen/AsmPrinter/DIEHash.h
@@ -14,8 +14,8 @@
 #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DIEHASH_H
 #define LLVM_LIB_CODEGEN_ASMPRINTER_DIEHASH_H
 
-#include "DIE.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/DIE.h"
 #include "llvm/Support/MD5.h"
 
 namespace llvm {
diff --git a/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/lib/CodeGen/AsmPrinter/DebugLocEntry.h
index 6cca985..6d55c03 100644
--- a/lib/CodeGen/AsmPrinter/DebugLocEntry.h
+++ b/lib/CodeGen/AsmPrinter/DebugLocEntry.h
@@ -11,8 +11,8 @@
 #define LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCENTRY_H
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DebugInfo.h"
-#include "llvm/MC/MachineLocation.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MachineLocation.h"
 
 namespace llvm {
 class MDNode;
@@ -74,7 +74,7 @@
     MachineLocation getLoc() const { return Loc; }
     const MDNode *getVariableNode() const { return Variable; }
     DIVariable getVariable() const { return DIVariable(Variable); }
-    bool isVariablePiece() const { return getExpression().isVariablePiece(); }
+    bool isBitPiece() const { return getExpression().isBitPiece(); }
     DIExpression getExpression() const { return DIExpression(Expression); }
     friend bool operator==(const Value &, const Value &);
     friend bool operator<(const Value &, const Value &);
@@ -101,8 +101,8 @@
       DIVariable Var(Values[0].Variable);
       DIExpression NextExpr(Next.Values[0].Expression);
       DIVariable NextVar(Next.Values[0].Variable);
-      if (Var == NextVar && Expr.isVariablePiece() &&
-          NextExpr.isVariablePiece()) {
+      if (Var == NextVar && Expr.isBitPiece() &&
+          NextExpr.isBitPiece()) {
         addValues(Next.Values);
         End = Next.End;
         return true;
@@ -131,7 +131,7 @@
     Values.append(Vals.begin(), Vals.end());
     sortUniqueValues();
     assert(std::all_of(Values.begin(), Values.end(), [](DebugLocEntry::Value V){
-          return V.isVariablePiece();
+          return V.isBitPiece();
         }) && "value must be a piece");
   }
 
@@ -176,8 +176,8 @@
 /// Compare two pieces based on their offset.
 inline bool operator<(const DebugLocEntry::Value &A,
                       const DebugLocEntry::Value &B) {
-  return A.getExpression().getPieceOffset() <
-         B.getExpression().getPieceOffset();
+  return A.getExpression().getBitPieceOffset() <
+         B.getExpression().getBitPieceOffset();
 }
 
 }
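
A hypothetical, much-simplified model of the piece bookkeeping above: each entry keeps its values sorted by bit-piece offset (which is what operator< provides for), and a later entry's values can be folded into an earlier one when both describe pieces of the same variable. PieceValue, LocEntry and addValues below are stand-ins, not the real classes.

    #include <algorithm>
    #include <vector>

    struct PieceValue {
      const void *Variable; // stands in for the variable's metadata node
      unsigned BitOffset;   // getBitPieceOffset()
      unsigned BitSize;     // getBitPieceSize()
    };

    struct LocEntry {
      std::vector<PieceValue> Values;

      // Fold another entry's pieces into this one and keep the pieces ordered
      // by offset, mirroring sortUniqueValues()/operator< in the real code.
      void addValues(const std::vector<PieceValue> &Vals) {
        Values.insert(Values.end(), Vals.begin(), Vals.end());
        std::sort(Values.begin(), Values.end(),
                  [](const PieceValue &A, const PieceValue &B) {
                    return A.BitOffset < B.BitOffset;
                  });
      }
    };
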
diff --git a/lib/CodeGen/AsmPrinter/DebugLocList.h b/lib/CodeGen/AsmPrinter/DebugLocList.h
index 2a4f58f..0f1d2ed 100644
--- a/lib/CodeGen/AsmPrinter/DebugLocList.h
+++ b/lib/CodeGen/AsmPrinter/DebugLocList.h
@@ -10,8 +10,8 @@
 #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCLIST_H
 #define LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCLIST_H
 
-#include "llvm/ADT/SmallVector.h"
 #include "DebugLocEntry.h"
+#include "llvm/ADT/SmallVector.h"
 
 namespace llvm {
 class DwarfCompileUnit;
diff --git a/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp b/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
index 7e87566..a71f35e 100644
--- a/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
@@ -12,12 +12,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "DwarfAccelTable.h"
-#include "DIE.h"
 #include "DwarfCompileUnit.h"
 #include "DwarfDebug.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DIE.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
diff --git a/lib/CodeGen/AsmPrinter/DwarfAccelTable.h b/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
index 3cdf678..74963da 100644
--- a/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
+++ b/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
@@ -14,9 +14,9 @@
 #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFACCELTABLE_H
 #define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFACCELTABLE_H
 
-#include "DIE.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/CodeGen/DIE.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/DataTypes.h"
@@ -215,8 +215,8 @@
 #endif
   };
 
-  DwarfAccelTable(const DwarfAccelTable &) LLVM_DELETED_FUNCTION;
-  void operator=(const DwarfAccelTable &) LLVM_DELETED_FUNCTION;
+  DwarfAccelTable(const DwarfAccelTable &) = delete;
+  void operator=(const DwarfAccelTable &) = delete;
 
   // Internal Functions
   void EmitHeader(AsmPrinter *);
diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
index 0dc52da..f45b24c 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
@@ -51,7 +51,8 @@
   if (moveTypeModule == AsmPrinter::CFI_M_Debug)
     Asm->OutStreamer.EmitCFISections(false, true);
 
-  if (!Asm->MAI->usesItaniumLSDAForExceptions())
+  // SjLj uses this pass and it doesn't need this info.
+  if (!Asm->MAI->usesCFIForEH())
     return;
 
   const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
@@ -90,7 +91,7 @@
 
   const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
   unsigned PerEncoding = TLOF.getPersonalityEncoding();
-  const Function *Per = MMI->getPersonalities()[MMI->getPersonalityIndex()];
+  const Function *Per = MMI->getPersonality();
 
   shouldEmitPersonality = hasLandingPads &&
     PerEncoding != dwarf::DW_EH_PE_omit && Per;
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 2f1b0e5..dcc5fe4 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -1,5 +1,5 @@
 #include "DwarfCompileUnit.h"
-
+#include "DwarfExpression.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/GlobalValue.h"
@@ -10,8 +10,8 @@
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
 
 namespace llvm {
 
@@ -103,7 +103,7 @@
 
   assert(GV.isGlobalVariable());
 
-  DIScope GVContext = DD->resolve(GV.getContext());
+  DIScope GVContext = GV.getContext();
   DIType GTy = DD->resolve(GV.getType());
 
   // Construct the context before querying for the existence of the DIE in
@@ -122,7 +122,7 @@
     DIE *VariableSpecDIE = getOrCreateStaticMemberDIE(SDMDecl);
     addDIEEntry(*VariableDIE, dwarf::DW_AT_specification, *VariableSpecDIE);
   } else {
-    DeclContext = resolve(GV.getContext());
+    DeclContext = GV.getContext();
     // Add name and type.
     addString(*VariableDIE, dwarf::DW_AT_name, GV.getDisplayName());
     addType(*VariableDIE, GTy);
@@ -292,10 +292,10 @@
 
   // Only include DW_AT_frame_base in full debug info
   if (!includeMinimalInlineScopes()) {
-    const TargetRegisterInfo *RI =
-        Asm->TM.getSubtargetImpl()->getRegisterInfo();
+    const TargetRegisterInfo *RI = Asm->MF->getSubtarget().getRegisterInfo();
     MachineLocation Location(RI->getFrameRegister(*Asm->MF));
-    addAddress(*SPDie, dwarf::DW_AT_frame_base, Location);
+    if (RI->isPhysicalRegister(Location.getReg()))
+      addAddress(*SPDie, dwarf::DW_AT_frame_base, Location);
   }
 
   // Add name to the name table, we do this here because we're guaranteed
@@ -515,15 +515,23 @@
   }
 
   // .. else use frame index.
-  int FI = DV.getFrameIndex();
-  if (FI != ~0) {
+  if (DV.getFrameIndex().back() == ~0)
+    return VariableDie;
+
+  auto Expr = DV.getExpression().begin();
+  DIELoc *Loc = new (DIEValueAllocator) DIELoc();
+  DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
+  for (auto FI : DV.getFrameIndex()) {
     unsigned FrameReg = 0;
-    const TargetFrameLowering *TFI =
-        Asm->TM.getSubtargetImpl()->getFrameLowering();
+    const TargetFrameLowering *TFI = Asm->MF->getSubtarget().getFrameLowering();
     int Offset = TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg);
-    MachineLocation Location(FrameReg, Offset);
-    addVariableAddress(DV, *VariableDie, Location);
+    assert(Expr != DV.getExpression().end() &&
+           "Wrong number of expressions");
+    DwarfExpr.AddMachineRegIndirect(FrameReg, Offset);
+    DwarfExpr.AddExpression(Expr->begin(), Expr->end());
+    ++Expr;
   }
+  addBlock(*VariableDie, dwarf::DW_AT_location, Loc);
 
   return VariableDie;
 }
@@ -694,7 +702,7 @@
   for (unsigned vi = 0, ve = Variables.getNumElements(); vi != ve; ++vi) {
     DIVariable DV(Variables.getElement(vi));
     assert(DV.isVariable());
-    DbgVariable NewVar(DV, DIExpression(nullptr), DD);
+    DbgVariable NewVar(DV, DIExpression(), DD);
     auto VariableDie = constructVariableDIE(NewVar);
     applyVariableAttributes(NewVar, *VariableDie);
     SPDIE->addChild(std::move(VariableDie));
@@ -736,24 +744,22 @@
   else if (DV.isBlockByrefVariable())
     addBlockByrefAddress(DV, Die, dwarf::DW_AT_location, Location);
   else
-    addAddress(Die, dwarf::DW_AT_location, Location,
-               DV.getVariable().isIndirect());
+    addAddress(Die, dwarf::DW_AT_location, Location);
 }
 
 /// Add an address attribute to a die based on the location provided.
 void DwarfCompileUnit::addAddress(DIE &Die, dwarf::Attribute Attribute,
-                                  const MachineLocation &Location,
-                                  bool Indirect) {
+                                  const MachineLocation &Location) {
   DIELoc *Loc = new (DIEValueAllocator) DIELoc();
 
-  if (Location.isReg() && !Indirect)
-    addRegisterOpPiece(*Loc, Location.getReg());
-  else {
-    addRegisterOffset(*Loc, Location.getReg(), Location.getOffset());
-    if (Indirect && !Location.isReg()) {
-      addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
-    }
-  }
+  bool validReg;
+  if (Location.isReg())
+    validReg = addRegisterOpPiece(*Loc, Location.getReg());
+  else
+    validReg = addRegisterOffset(*Loc, Location.getReg(), Location.getOffset());
+
+  if (!validReg)
+    return;
 
   // Now attach the location information to the DIE.
   addBlock(Die, Attribute, Loc);
@@ -767,53 +773,21 @@
                                          dwarf::Attribute Attribute,
                                          const MachineLocation &Location) {
   DIELoc *Loc = new (DIEValueAllocator) DIELoc();
-  unsigned N = DV.getNumAddrElements();
-  unsigned i = 0;
-  if (Location.isReg()) {
-    if (N >= 2 && DV.getAddrElement(0) == dwarf::DW_OP_plus) {
-      assert(!DV.getVariable().isIndirect() &&
-             "double indirection not handled");
-      // If first address element is OpPlus then emit
-      // DW_OP_breg + Offset instead of DW_OP_reg + Offset.
-      addRegisterOffset(*Loc, Location.getReg(), DV.getAddrElement(1));
-      i = 2;
-    } else if (N >= 2 && DV.getAddrElement(0) == dwarf::DW_OP_deref) {
-      assert(!DV.getVariable().isIndirect() &&
-             "double indirection not handled");
-      addRegisterOpPiece(*Loc, Location.getReg(),
-                         DV.getExpression().getPieceSize(),
-                         DV.getExpression().getPieceOffset());
-      i = 3;
-    } else
-      addRegisterOpPiece(*Loc, Location.getReg());
+  DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
+  assert(DV.getExpression().size() == 1);
+  DIExpression Expr = DV.getExpression().back();
+  bool ValidReg;
+  if (Location.getOffset()) {
+    ValidReg = DwarfExpr.AddMachineRegIndirect(Location.getReg(),
+                                               Location.getOffset());
+    if (ValidReg)
+      DwarfExpr.AddExpression(Expr.begin(), Expr.end());
   } else
-    addRegisterOffset(*Loc, Location.getReg(), Location.getOffset());
-
-  for (; i < N; ++i) {
-    uint64_t Element = DV.getAddrElement(i);
-    if (Element == dwarf::DW_OP_plus) {
-      addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
-      addUInt(*Loc, dwarf::DW_FORM_udata, DV.getAddrElement(++i));
-
-    } else if (Element == dwarf::DW_OP_deref) {
-      if (!Location.isReg())
-        addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
-
-    } else if (Element == dwarf::DW_OP_piece) {
-      const unsigned SizeOfByte = 8;
-      unsigned PieceOffsetInBits = DV.getAddrElement(++i) * SizeOfByte;
-      unsigned PieceSizeInBits = DV.getAddrElement(++i) * SizeOfByte;
-      // Emit DW_OP_bit_piece Size Offset.
-      assert(PieceSizeInBits > 0 && "piece has zero size");
-      addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_bit_piece);
-      addUInt(*Loc, dwarf::DW_FORM_udata, PieceSizeInBits);
-      addUInt(*Loc, dwarf::DW_FORM_udata, PieceOffsetInBits);
-    } else
-      llvm_unreachable("unknown DIBuilder Opcode");
-  }
+    ValidReg = DwarfExpr.AddMachineRegExpression(Expr, Location.getReg());
 
   // Now attach the location information to the DIE.
-  addBlock(Die, Attribute, Loc);
+  if (ValidReg)
+    addBlock(Die, Attribute, Loc);
 }
 
 /// Add a Dwarf loclistptr attribute data and value.
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index e521f39..c66af65 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -15,9 +15,9 @@
 #define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
 
 #include "DwarfUnit.h"
-#include "llvm/Support/Dwarf.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/DebugInfo.h"
+#include "llvm/Support/Dwarf.h"
 
 namespace llvm {
 
@@ -213,7 +213,7 @@
                           MachineLocation Location);
   /// Add an address attribute to a die based on the location provided.
   void addAddress(DIE &Die, dwarf::Attribute Attribute,
-                  const MachineLocation &Location, bool Indirect = false);
+                  const MachineLocation &Location);
 
   /// Start with the address based on the location provided, and generate the
   /// DWARF information necessary to find the actual variable (navigating the
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 230ea46..aa1f79f 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -12,16 +12,16 @@
 //===----------------------------------------------------------------------===//
 
 #include "DwarfDebug.h"
-
 #include "ByteStreamer.h"
-#include "DwarfCompileUnit.h"
-#include "DIE.h"
 #include "DIEHash.h"
+#include "DwarfCompileUnit.h"
+#include "DwarfExpression.h"
 #include "DwarfUnit.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/DIE.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/IR/Constants.h"
@@ -490,9 +490,6 @@
 
   // Tell MMI that we have debug info.
   MMI->setDebugInfoAvailability(true);
-
-  // Prime section data.
-  SectionMap[Asm->getObjFileLowering().getTextSection()];
 }
 
 void DwarfDebug::finishVariableDefinitions() {
@@ -608,53 +605,6 @@
     SkeletonHolder.computeSizeAndOffsets();
 }
 
-void DwarfDebug::endSections() {
-  // Filter labels by section.
-  for (const SymbolCU &SCU : ArangeLabels) {
-    if (SCU.Sym->isInSection()) {
-      // Make a note of this symbol and it's section.
-      const MCSection *Section = &SCU.Sym->getSection();
-      if (!Section->getKind().isMetadata())
-        SectionMap[Section].push_back(SCU);
-    } else {
-      // Some symbols (e.g. common/bss on mach-o) can have no section but still
-      // appear in the output. This sucks as we rely on sections to build
-      // arange spans. We can do it without, but it's icky.
-      SectionMap[nullptr].push_back(SCU);
-    }
-  }
-
-  // Build a list of sections used.
-  std::vector<const MCSection *> Sections;
-  for (const auto &it : SectionMap) {
-    const MCSection *Section = it.first;
-    Sections.push_back(Section);
-  }
-
-  // Sort the sections into order.
-  // This is only done to ensure consistent output order across different runs.
-  std::sort(Sections.begin(), Sections.end(), SectionSort);
-
-  // Add terminating symbols for each section.
-  for (unsigned ID = 0, E = Sections.size(); ID != E; ID++) {
-    const MCSection *Section = Sections[ID];
-    MCSymbol *Sym = nullptr;
-
-    if (Section) {
-      // We can't call MCSection::getLabelEndName, as it's only safe to do so
-      // if we know the section name up-front. For user-created sections, the
-      // resulting label may not be valid to use as a label. (section names can
-      // use a greater set of characters on some systems)
-      Sym = Asm->GetTempSymbol("debug_end", ID);
-      Asm->OutStreamer.SwitchSection(Section);
-      Asm->OutStreamer.EmitLabel(Sym);
-    }
-
-    // Insert a final terminator.
-    SectionMap[Section].push_back(SymbolCU(nullptr, Sym));
-  }
-}
-
 // Emit all Dwarf sections that should come after the content.
 void DwarfDebug::endModule() {
   assert(CurFn == nullptr);
@@ -666,10 +616,6 @@
   if (!DwarfInfoSectionSym)
     return;
 
-  // End any existing sections.
-  // TODO: Does this need to happen?
-  endSections();
-
   // Finalize the debug info for the module.
   finalizeModuleInfo();
 
@@ -783,10 +729,9 @@
     DIVariable DV(VI.Var);
     DIExpression Expr(VI.Expr);
     ensureAbstractVariableIsCreatedIfScoped(DV, Scope->getScopeNode());
-    ConcreteVariables.push_back(make_unique<DbgVariable>(DV, Expr, this));
-    DbgVariable *RegVar = ConcreteVariables.back().get();
-    RegVar->setFrameIndex(VI.Slot);
-    InfoHolder.addScopeVariable(Scope, RegVar);
+    auto RegVar = make_unique<DbgVariable>(DV, Expr, this, VI.Slot);
+    if (InfoHolder.addScopeVariable(Scope, RegVar.get()))
+      ConcreteVariables.push_back(std::move(RegVar));
   }
 }
 
@@ -818,12 +763,12 @@
 
 /// Determine whether two variable pieces overlap.
 static bool piecesOverlap(DIExpression P1, DIExpression P2) {
-  if (!P1.isVariablePiece() || !P2.isVariablePiece())
+  if (!P1.isBitPiece() || !P2.isBitPiece())
     return true;
-  unsigned l1 = P1.getPieceOffset();
-  unsigned l2 = P2.getPieceOffset();
-  unsigned r1 = l1 + P1.getPieceSize();
-  unsigned r2 = l2 + P2.getPieceSize();
+  unsigned l1 = P1.getBitPieceOffset();
+  unsigned l2 = P2.getBitPieceOffset();
+  unsigned r1 = l1 + P1.getBitPieceSize();
+  unsigned r2 = l2 + P2.getBitPieceSize();
   // True where [l1,r1[ and [l2,r2[ overlap.
   return (l1 < r2) && (l2 < r1);
 }
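
piecesOverlap treats each piece as the half-open bit range [offset, offset + size). A standalone sanity check of the formula, with made-up offsets and sizes:

    #include <cassert>

    // [l1,r1) and [l2,r2) share at least one bit exactly when l1 < r2 && l2 < r1.
    static bool overlaps(unsigned l1, unsigned r1, unsigned l2, unsigned r2) {
      return l1 < r2 && l2 < r1;
    }

    int main() {
      assert(overlaps(0, 32, 16, 48));  // bits 0..31 and 16..47 overlap
      assert(!overlaps(0, 32, 32, 64)); // adjacent pieces do not overlap
      assert(overlaps(0, 64, 16, 32));  // containment counts as overlap
      return 0;
    }
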
@@ -842,7 +787,8 @@
 // 1 | |    [x, (reg1, piece 32, 32)] <- IsPieceOfPrevEntry
 // 2 | |    ...
 // 3   |    [clobber reg0]
-// 4        [x, (mem, piece 0, 64)] <- overlapping with both previous pieces of x.
+// 4        [x, (mem, piece 0, 64)] <- overlapping with both previous pieces of
+//                                     x.
 //
 // Output:
 //
@@ -894,7 +840,7 @@
     bool couldMerge = false;
 
     // If this is a piece, it may belong to the current DebugLocEntry.
-    if (DIExpr.isVariablePiece()) {
+    if (DIExpr.isBitPiece()) {
       // Add this value to the list of open ranges.
       OpenRanges.push_back(Value);
 
@@ -950,11 +896,9 @@
       continue;
 
     LexicalScope *Scope = nullptr;
-    if (MDNode *IA = DV.getInlinedAt()) {
-      DebugLoc DL = DebugLoc::getFromDILocation(IA);
-      Scope = LScopes.findInlinedScope(DebugLoc::get(
-          DL.getLine(), DL.getCol(), DV.getContext(), IA));
-    } else
+    if (MDNode *IA = DV.getInlinedAt())
+      Scope = LScopes.findInlinedScope(DV.getContext(), IA);
+    else
       Scope = LScopes.findLexicalScope(DV.getContext());
     // If variable scope is not found then skip this variable.
     if (!Scope)
@@ -1026,8 +970,10 @@
       if (DL == PrologEndLoc) {
         Flags |= DWARF2_FLAG_PROLOGUE_END;
         PrologEndLoc = DebugLoc();
+        Flags |= DWARF2_FLAG_IS_STMT;
       }
-      if (PrologEndLoc.isUnknown())
+      if (DL.getLine() !=
+          Asm->OutStreamer.getContext().getCurrentDwarfLoc().getLine())
         Flags |= DWARF2_FLAG_IS_STMT;
 
       if (!DL.isUnknown()) {
@@ -1117,8 +1063,12 @@
   for (const auto &MBB : *MF)
     for (const auto &MI : MBB)
       if (!MI.isDebugValue() && !MI.getFlag(MachineInstr::FrameSetup) &&
-          !MI.getDebugLoc().isUnknown())
+          !MI.getDebugLoc().isUnknown()) {
+        // Did the target forget to set the FrameSetup flag for CFI insns?
+        assert(!MI.isCFIInstruction() &&
+               "First non-frame-setup instruction is a CFI instruction.");
         return MI.getDebugLoc();
+      }
   return DebugLoc();
 }
 
@@ -1172,7 +1122,7 @@
   Asm->OutStreamer.EmitLabel(FunctionBeginSym);
 
   // Calculate history for local variables.
-  calculateDbgValueHistory(MF, Asm->TM.getSubtargetImpl()->getRegisterInfo(),
+  calculateDbgValueHistory(MF, Asm->MF->getSubtarget().getRegisterInfo(),
                            DbgValues);
 
   // Request labels for the full history.
@@ -1187,7 +1137,7 @@
     if (DIVar.isVariable() && DIVar.getTag() == dwarf::DW_TAG_arg_variable &&
         getDISubprogram(DIVar.getContext()).describes(MF->getFunction())) {
       LabelsBeforeInsn[Ranges.front().first] = FunctionBeginSym;
-      if (Ranges.front().first->getDebugExpression().isVariablePiece()) {
+      if (Ranges.front().first->getDebugExpression().isBitPiece()) {
         // Mark all non-overlapping initial pieces.
         for (auto I = Ranges.begin(); I != Ranges.end(); ++I) {
           DIExpression Piece = I->first->getDebugExpression();
@@ -1217,12 +1167,12 @@
   if (!PrologEndLoc.isUnknown()) {
     DebugLoc FnStartDL =
         PrologEndLoc.getFnDebugLoc(MF->getFunction()->getContext());
-    recordSourceLine(
-        FnStartDL.getLine(), FnStartDL.getCol(),
-        FnStartDL.getScope(MF->getFunction()->getContext()),
-        // We'd like to list the prologue as "not statements" but GDB behaves
-        // poorly if we do that. Revisit this with caution/GDB (7.5+) testing.
-        DWARF2_FLAG_IS_STMT);
+
+    // We'd like to list the prologue as "not statements" but GDB behaves
+    // poorly if we do that. Revisit this with caution/GDB (7.5+) testing.
+    recordSourceLine(FnStartDL.getLine(), FnStartDL.getCol(),
+                     FnStartDL.getScope(MF->getFunction()->getContext()),
+                     DWARF2_FLAG_IS_STMT);
   }
 }
 
@@ -1350,8 +1300,8 @@
   if (useSplitDwarf()) {
     DwarfInfoDWOSectionSym =
         emitSectionSym(Asm, TLOF.getDwarfInfoDWOSection(), "section_info_dwo");
-    DwarfTypesDWOSectionSym =
-        emitSectionSym(Asm, TLOF.getDwarfTypesDWOSection(), "section_types_dwo");
+    DwarfTypesDWOSectionSym = emitSectionSym(
+        Asm, TLOF.getDwarfTypesDWOSection(), "section_types_dwo");
   }
   DwarfAbbrevSectionSym =
       emitSectionSym(Asm, TLOF.getDwarfAbbrevSection(), "section_abbrev");
@@ -1553,7 +1503,6 @@
     return dwarf::GIEK_TYPE;
   case dwarf::DW_TAG_subprogram:
     return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_FUNCTION, Linkage);
-  case dwarf::DW_TAG_constant:
   case dwarf::DW_TAG_variable:
     return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_VARIABLE, Linkage);
   case dwarf::DW_TAG_enumerator:
@@ -1656,7 +1605,7 @@
                                const DITypeIdentifierMap &Map,
                                ArrayRef<DebugLocEntry::Value> Values) {
   assert(std::all_of(Values.begin(), Values.end(), [](DebugLocEntry::Value P) {
-        return P.isVariablePiece();
+        return P.isBitPiece();
       }) && "all values are expected to be pieces");
   assert(std::is_sorted(Values.begin(), Values.end()) &&
          "pieces are expected to be sorted");
@@ -1664,35 +1613,25 @@
   unsigned Offset = 0;
   for (auto Piece : Values) {
     DIExpression Expr = Piece.getExpression();
-    unsigned PieceOffset = Expr.getPieceOffset();
-    unsigned PieceSize = Expr.getPieceSize();
+    unsigned PieceOffset = Expr.getBitPieceOffset();
+    unsigned PieceSize = Expr.getBitPieceSize();
     assert(Offset <= PieceOffset && "overlapping or duplicate pieces");
     if (Offset < PieceOffset) {
       // The DWARF spec seriously mandates pieces with no locations for gaps.
-      Asm->EmitDwarfOpPiece(Streamer, (PieceOffset-Offset)*8);
+      Asm->EmitDwarfOpPiece(Streamer, PieceOffset-Offset);
       Offset += PieceOffset-Offset;
     }
-
     Offset += PieceSize;
 
-    const unsigned SizeOfByte = 8;
 #ifndef NDEBUG
     DIVariable Var = Piece.getVariable();
-    assert(!Var.isIndirect() && "indirect address for piece");
     unsigned VarSize = Var.getSizeInBits(Map);
-    assert(PieceSize+PieceOffset <= VarSize/SizeOfByte
+    assert(PieceSize+PieceOffset <= VarSize
            && "piece is larger than or outside of variable");
-    assert(PieceSize*SizeOfByte != VarSize
+    assert(PieceSize != VarSize
            && "piece covers entire variable");
 #endif
-    if (Piece.isLocation() && Piece.getLoc().isReg())
-      Asm->EmitDwarfRegOpPiece(Streamer,
-                               Piece.getLoc(),
-                               PieceSize*SizeOfByte);
-    else {
-      emitDebugLocValue(Streamer, Piece);
-      Asm->EmitDwarfOpPiece(Streamer, PieceSize*SizeOfByte);
-    }
+    emitDebugLocValue(Streamer, Piece, PieceOffset);
   }
 }
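
The gap handling above relies on how a piece is encoded: a byte-aligned, byte-sized piece becomes DW_OP_piece with a byte count, anything else becomes DW_OP_bit_piece, the same rule visible in the old EmitDwarfOpPiece removed earlier in this patch. A small standalone illustration; the 128-bit variable in main() is an assumed example, not taken from this patch.

    #include <cstdio>

    // Mirrors the DW_OP_piece / DW_OP_bit_piece selection rule; illustrative only.
    static void emitOpPiece(unsigned SizeInBits, unsigned OffsetInBits = 0) {
      if (OffsetInBits > 0 || SizeInBits % 8)
        std::printf("DW_OP_bit_piece %u %u\n", SizeInBits, OffsetInBits);
      else
        std::printf("DW_OP_piece %u\n", SizeInBits / 8);
    }

    int main() {
      // Assumed example: a 128-bit variable whose only known location covers
      // bits [64, 96). The gap [0, 64) gets an empty piece first, so the
      // consumer's running offset stays in sync.
      emitOpPiece(64); // gap: "DW_OP_piece 8", with no location operator before it
      // ... the register or memory location for bits [64, 96) goes here ...
      emitOpPiece(32); // "DW_OP_piece 4" describing the located bits
      return 0;
    }
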
 
@@ -1700,7 +1639,7 @@
 void DwarfDebug::emitDebugLocEntry(ByteStreamer &Streamer,
                                    const DebugLocEntry &Entry) {
   const DebugLocEntry::Value Value = Entry.getValues()[0];
-  if (Value.isVariablePiece())
+  if (Value.isBitPiece())
     // Emit all pieces that belong to the same variable and range.
     return emitLocPieces(Streamer, TypeIdentifierMap, Entry.getValues());
 
@@ -1709,62 +1648,33 @@
 }
 
 void DwarfDebug::emitDebugLocValue(ByteStreamer &Streamer,
-                                   const DebugLocEntry::Value &Value) {
+                                   const DebugLocEntry::Value &Value,
+                                   unsigned PieceOffsetInBits) {
   DIVariable DV = Value.getVariable();
+  DebugLocDwarfExpression DwarfExpr(*Asm, Streamer);
+
   // Regular entry.
   if (Value.isInt()) {
     DIBasicType BTy(resolve(DV.getType()));
     if (BTy.Verify() && (BTy.getEncoding() == dwarf::DW_ATE_signed ||
-                         BTy.getEncoding() == dwarf::DW_ATE_signed_char)) {
-      Streamer.EmitInt8(dwarf::DW_OP_consts, "DW_OP_consts");
-      Streamer.EmitSLEB128(Value.getInt());
-    } else {
-      Streamer.EmitInt8(dwarf::DW_OP_constu, "DW_OP_constu");
-      Streamer.EmitULEB128(Value.getInt());
-    }
+                         BTy.getEncoding() == dwarf::DW_ATE_signed_char))
+      DwarfExpr.AddSignedConstant(Value.getInt());
+    else
+      DwarfExpr.AddUnsignedConstant(Value.getInt());
   } else if (Value.isLocation()) {
     MachineLocation Loc = Value.getLoc();
     DIExpression Expr = Value.getExpression();
-    if (!Expr)
+    if (!Expr || (Expr.getNumElements() == 0))
       // Regular entry.
-      Asm->EmitDwarfRegOp(Streamer, Loc, DV.isIndirect());
+      Asm->EmitDwarfRegOp(Streamer, Loc);
     else {
       // Complex address entry.
-      unsigned N = Expr.getNumElements();
-      unsigned i = 0;
-      if (N >= 2 && Expr.getElement(0) == dwarf::DW_OP_plus) {
-        if (Loc.getOffset()) {
-          i = 2;
-          Asm->EmitDwarfRegOp(Streamer, Loc, DV.isIndirect());
-          Streamer.EmitInt8(dwarf::DW_OP_deref, "DW_OP_deref");
-          Streamer.EmitInt8(dwarf::DW_OP_plus_uconst, "DW_OP_plus_uconst");
-          Streamer.EmitSLEB128(Expr.getElement(1));
-        } else {
-          // If first address element is OpPlus then emit
-          // DW_OP_breg + Offset instead of DW_OP_reg + Offset.
-          MachineLocation TLoc(Loc.getReg(), Expr.getElement(1));
-          Asm->EmitDwarfRegOp(Streamer, TLoc, DV.isIndirect());
-          i = 2;
-        }
-      } else {
-        Asm->EmitDwarfRegOp(Streamer, Loc, DV.isIndirect());
-      }
-
-      // Emit remaining complex address elements.
-      for (; i < N; ++i) {
-        uint64_t Element = Expr.getElement(i);
-        if (Element == dwarf::DW_OP_plus) {
-          Streamer.EmitInt8(dwarf::DW_OP_plus_uconst, "DW_OP_plus_uconst");
-          Streamer.EmitULEB128(Expr.getElement(++i));
-        } else if (Element == dwarf::DW_OP_deref) {
-          if (!Loc.isReg())
-            Streamer.EmitInt8(dwarf::DW_OP_deref, "DW_OP_deref");
-        } else if (Element == dwarf::DW_OP_piece) {
-          i += 3;
-          // handled in emitDebugLocEntry.
-        } else
-          llvm_unreachable("unknown Opcode found in complex address");
-      }
+      if (Loc.getOffset()) {
+        DwarfExpr.AddMachineRegIndirect(Loc.getReg(), Loc.getOffset());
+        DwarfExpr.AddExpression(Expr.begin(), Expr.end(), PieceOffsetInBits);
+      } else
+        DwarfExpr.AddMachineRegExpression(Expr, Loc.getReg(),
+                                          PieceOffsetInBits);
     }
   }
   // else ... ignore constant fp. There is not any good way to
@@ -1841,13 +1751,26 @@
 // Emit a debug aranges section, containing a CU lookup for any
 // address we can tie back to a CU.
 void DwarfDebug::emitDebugARanges() {
-  // Start the dwarf aranges section.
-  Asm->OutStreamer.SwitchSection(
-      Asm->getObjFileLowering().getDwarfARangesSection());
+  // Provides a unique id per text section.
+  DenseMap<const MCSection *, SmallVector<SymbolCU, 8>> SectionMap;
 
-  typedef DenseMap<DwarfCompileUnit *, std::vector<ArangeSpan>> SpansType;
+  // Prime section data.
+  SectionMap[Asm->getObjFileLowering().getTextSection()];
 
-  SpansType Spans;
+  // Filter labels by section.
+  for (const SymbolCU &SCU : ArangeLabels) {
+    if (SCU.Sym->isInSection()) {
+      // Make a note of this symbol and its section.
+      const MCSection *Section = &SCU.Sym->getSection();
+      if (!Section->getKind().isMetadata())
+        SectionMap[Section].push_back(SCU);
+    } else {
+      // Some symbols (e.g. common/bss on mach-o) can have no section but still
+      // appear in the output. This sucks as we rely on sections to build
+      // arange spans. We can do it without, but it's icky.
+      SectionMap[nullptr].push_back(SCU);
+    }
+  }
 
   // Build a list of sections used.
   std::vector<const MCSection *> Sections;
@@ -1860,12 +1783,45 @@
   // This is only done to ensure consistent output order across different runs.
   std::sort(Sections.begin(), Sections.end(), SectionSort);
 
-  // Build a set of address spans, sorted by CU.
+  // Add terminating symbols for each section.
+  for (unsigned ID = 0, E = Sections.size(); ID != E; ID++) {
+    const MCSection *Section = Sections[ID];
+    MCSymbol *Sym = nullptr;
+
+    if (Section) {
+      // We can't call MCSection::getLabelEndName, as it's only safe to do so
+      // if we know the section name up-front. For user-created sections, the
+      // resulting label may not be valid to use as a label. (section names can
+      // use a greater set of characters on some systems)
+      Sym = Asm->GetTempSymbol("debug_end", ID);
+      Asm->OutStreamer.SwitchSection(Section);
+      Asm->OutStreamer.EmitLabel(Sym);
+    }
+
+    // Insert a final terminator.
+    SectionMap[Section].push_back(SymbolCU(nullptr, Sym));
+  }
+
+  DenseMap<DwarfCompileUnit *, std::vector<ArangeSpan>> Spans;
+
   for (const MCSection *Section : Sections) {
     SmallVector<SymbolCU, 8> &List = SectionMap[Section];
     if (List.size() < 2)
       continue;
 
+    // If we have no section (e.g. common), just write out
+    // individual spans for each symbol.
+    if (!Section) {
+      for (const SymbolCU &Cur : List) {
+        ArangeSpan Span;
+        Span.Start = Cur.Sym;
+        Span.End = nullptr;
+        if (Cur.CU)
+          Spans[Cur.CU].push_back(Span);
+      }
+      continue;
+    }
+
     // Sort the symbols by offset within the section.
     std::sort(List.begin(), List.end(),
               [&](const SymbolCU &A, const SymbolCU &B) {
@@ -1881,35 +1837,27 @@
       return IA < IB;
     });
 
-    // If we have no section (e.g. common), just write out
-    // individual spans for each symbol.
-    if (!Section) {
-      for (const SymbolCU &Cur : List) {
-        ArangeSpan Span;
-        Span.Start = Cur.Sym;
-        Span.End = nullptr;
-        if (Cur.CU)
-          Spans[Cur.CU].push_back(Span);
-      }
-    } else {
-      // Build spans between each label.
-      const MCSymbol *StartSym = List[0].Sym;
-      for (size_t n = 1, e = List.size(); n < e; n++) {
-        const SymbolCU &Prev = List[n - 1];
-        const SymbolCU &Cur = List[n];
+    // Build spans between each label.
+    const MCSymbol *StartSym = List[0].Sym;
+    for (size_t n = 1, e = List.size(); n < e; n++) {
+      const SymbolCU &Prev = List[n - 1];
+      const SymbolCU &Cur = List[n];
 
-        // Try and build the longest span we can within the same CU.
-        if (Cur.CU != Prev.CU) {
-          ArangeSpan Span;
-          Span.Start = StartSym;
-          Span.End = Cur.Sym;
-          Spans[Prev.CU].push_back(Span);
-          StartSym = Cur.Sym;
-        }
+      // Try and build the longest span we can within the same CU.
+      if (Cur.CU != Prev.CU) {
+        ArangeSpan Span;
+        Span.Start = StartSym;
+        Span.End = Cur.Sym;
+        Spans[Prev.CU].push_back(Span);
+        StartSym = Cur.Sym;
       }
     }
   }
 
+  // Start the dwarf aranges section.
+  Asm->OutStreamer.SwitchSection(
+      Asm->getObjFileLowering().getDwarfARangesSection());
+
   unsigned PtrSize = Asm->getDataLayout().getPointerSize();
 
   // Build a list of CUs used.
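
For context, the restructured emitDebugARanges() above reduces to a run-length grouping of section labels by owning compile unit. A minimal standalone sketch of that grouping step, using simplified stand-ins for the real SymbolCU/ArangeSpan types (not the LLVM classes themselves):

#include <map>
#include <string>
#include <vector>

struct SymbolCU { int CU; std::string Sym; };   // hypothetical: CU id, label name
struct ArangeSpan { std::string Start, End; };

// Given section-local labels already sorted by offset (with the terminator
// label appended last), emit one span per maximal run of labels that belong
// to the same compile unit, mirroring the loop in emitDebugARanges().
std::map<int, std::vector<ArangeSpan>>
buildSpans(const std::vector<SymbolCU> &List) {
  std::map<int, std::vector<ArangeSpan>> Spans;
  if (List.size() < 2)
    return Spans;
  std::string StartSym = List[0].Sym;
  for (size_t n = 1, e = List.size(); n < e; ++n) {
    const SymbolCU &Prev = List[n - 1];
    const SymbolCU &Cur = List[n];
    // A new span starts whenever the owning CU changes.
    if (Cur.CU != Prev.CU) {
      Spans[Prev.CU].push_back({StartSym, Cur.Sym});
      StartSym = Cur.Sym;
    }
  }
  return Spans;
}
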
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 48c2809..1c0e163 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -14,26 +14,25 @@
 #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFDEBUG_H
 #define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFDEBUG_H
 
-#include "DwarfFile.h"
 #include "AsmPrinterHandler.h"
-#include "DIE.h"
 #include "DbgValueHistoryCalculator.h"
 #include "DebugLocEntry.h"
 #include "DebugLocList.h"
 #include "DwarfAccelTable.h"
+#include "DwarfFile.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/FoldingSet.h"
+#include "llvm/CodeGen/DIE.h"
 #include "llvm/CodeGen/LexicalScopes.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DebugLoc.h"
-#include "llvm/MC/MachineLocation.h"
 #include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MachineLocation.h"
 #include "llvm/Support/Allocator.h"
-
 #include <memory>
 
 namespace llvm {
@@ -68,41 +67,67 @@
 
 //===----------------------------------------------------------------------===//
 /// \brief This class is used to track local variable information.
+///
+/// - Variables whose location changes over time have a DotDebugLocOffset and
+///   the other fields are not used.
+///
+/// - Variables that are described by multiple MMI table entries have multiple
+///   expressions and frame indices.
 class DbgVariable {
-  DIVariable Var;             // Variable Descriptor.
-  DIExpression Expr;          // Complex address location expression.
-  DIE *TheDIE;                // Variable DIE.
-  unsigned DotDebugLocOffset; // Offset in DotDebugLocEntries.
-  const MachineInstr *MInsn;  // DBG_VALUE instruction of the variable.
-  int FrameIndex;
+  DIVariable Var;             /// Variable Descriptor.
+  SmallVector<DIExpression, 1> Expr; /// Complex address location expression.
+  DIE *TheDIE;                /// Variable DIE.
+  unsigned DotDebugLocOffset; /// Offset in DotDebugLocEntries.
+  const MachineInstr *MInsn;  /// DBG_VALUE instruction of the variable.
+  SmallVector<int, 1> FrameIndex; /// Frame index of the variable.
   DwarfDebug *DD;
 
 public:
   /// Construct a DbgVariable from a DIVariable.
-  DbgVariable(DIVariable V, DIExpression E, DwarfDebug *DD)
-      : Var(V), Expr(E), TheDIE(nullptr), DotDebugLocOffset(~0U),
-        MInsn(nullptr), FrameIndex(~0), DD(DD) {
-    assert(Var.Verify() && Expr.Verify());
+  DbgVariable(DIVariable V, DIExpression E, DwarfDebug *DD, int FI = ~0)
+      : Var(V), Expr(1, E), TheDIE(nullptr), DotDebugLocOffset(~0U),
+        MInsn(nullptr), DD(DD) {
+    FrameIndex.push_back(FI);
+    assert(Var.Verify() && E.Verify());
   }
 
   /// Construct a DbgVariable from a DEBUG_VALUE.
   /// AbstractVar may be NULL.
   DbgVariable(const MachineInstr *DbgValue, DwarfDebug *DD)
-      : Var(DbgValue->getDebugVariable()), Expr(DbgValue->getDebugExpression()),
-        TheDIE(nullptr), DotDebugLocOffset(~0U), MInsn(DbgValue),
-        FrameIndex(~0), DD(DD) {}
+      : Var(DbgValue->getDebugVariable()),
+        Expr(1, DbgValue->getDebugExpression()), TheDIE(nullptr),
+        DotDebugLocOffset(~0U), MInsn(DbgValue), DD(DD) {
+    FrameIndex.push_back(~0);
+  }
 
   // Accessors.
   DIVariable getVariable() const { return Var; }
-  DIExpression getExpression() const { return Expr; }
+  const ArrayRef<DIExpression> getExpression() const { return Expr; }
   void setDIE(DIE &D) { TheDIE = &D; }
   DIE *getDIE() const { return TheDIE; }
   void setDotDebugLocOffset(unsigned O) { DotDebugLocOffset = O; }
   unsigned getDotDebugLocOffset() const { return DotDebugLocOffset; }
   StringRef getName() const { return Var.getName(); }
   const MachineInstr *getMInsn() const { return MInsn; }
-  int getFrameIndex() const { return FrameIndex; }
-  void setFrameIndex(int FI) { FrameIndex = FI; }
+  const ArrayRef<int> getFrameIndex() const { return FrameIndex; }
+
+  void addMMIEntry(const DbgVariable &V) {
+    assert(  DotDebugLocOffset == ~0U &&   !MInsn && "not an MMI entry");
+    assert(V.DotDebugLocOffset == ~0U && !V.MInsn && "not an MMI entry");
+    assert(V.Var == Var && "conflicting DIVariable");
+
+    if (V.getFrameIndex().back() != ~0) {
+      auto E = V.getExpression();
+      auto FI = V.getFrameIndex();
+      Expr.append(E.begin(), E.end());
+      FrameIndex.append(FI.begin(), FI.end());
+    }
+    assert(Expr.size() > 1
+           ? std::all_of(Expr.begin(), Expr.end(),
+                         [](DIExpression &E) { return E.isBitPiece(); })
+           : (true && "conflicting locations for variable"));
+  }
+
   // Translate tag to proper Dwarf tag.
   dwarf::Tag getTag() const {
     if (Var.getTag() == dwarf::DW_TAG_arg_variable)
@@ -129,14 +154,11 @@
 
   bool variableHasComplexAddress() const {
     assert(Var.isVariable() && "Invalid complex DbgVariable!");
-    return Expr.getNumElements() > 0;
+    assert(Expr.size() == 1 &&
+           "variableHasComplexAddress() invoked on multi-FI variable");
+    return Expr.back().getNumElements() > 0;
   }
   bool isBlockByrefVariable() const;
-  unsigned getNumAddrElements() const {
-    assert(Var.isVariable() && "Invalid complex DbgVariable!");
-    return Expr.getNumElements();
-  }
-  uint64_t getAddrElement(unsigned i) const { return Expr.getElement(i); }
   DIType getType() const;
 
 private:
@@ -179,10 +201,6 @@
   // Size of each symbol emitted (for those symbols that have a specific size).
   DenseMap<const MCSymbol *, uint64_t> SymSize;
 
-  // Provides a unique id per text section.
-  typedef DenseMap<const MCSection *, SmallVector<SymbolCU, 8> > SectionMapType;
-  SectionMapType SectionMap;
-
   LexicalScopes LScopes;
 
   // Collection of abstract variables.
@@ -259,7 +277,8 @@
   // them.
   DenseMap<const MDNode *, const DwarfTypeUnit *> DwarfTypeUnits;
 
-  SmallVector<std::pair<std::unique_ptr<DwarfTypeUnit>, DICompositeType>, 1> TypeUnitsUnderConstruction;
+  SmallVector<std::pair<std::unique_ptr<DwarfTypeUnit>, DICompositeType>, 1>
+      TypeUnitsUnderConstruction;
 
   // Whether to emit the pubnames/pubtypes sections.
   bool HasDwarfPubSections;
@@ -348,10 +367,6 @@
   /// processed.
   void finalizeModuleInfo();
 
-  /// \brief Emit labels to close any remaining sections that have been left
-  /// open.
-  void endSections();
-
   /// \brief Emit the debug info section.
   void emitDebugInfo();
 
@@ -565,7 +580,8 @@
   void emitDebugLocEntry(ByteStreamer &Streamer, const DebugLocEntry &Entry);
   /// \brief emit a single value for the debug loc section.
   void emitDebugLocValue(ByteStreamer &Streamer,
-                         const DebugLocEntry::Value &Value);
+                         const DebugLocEntry::Value &Value,
+                         unsigned PieceOffsetInBits = 0);
   /// Emits an optimal (=sorted) sequence of DW_OP_pieces.
   void emitLocPieces(ByteStreamer &Streamer,
                      const DITypeIdentifierMap &Map,
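
The DbgVariable changes above let one variable carry several (expression, frame index) pairs, which is what addMMIEntry() accumulates when a variable is split across multiple stack slots. A minimal standalone sketch of that merge, with the expression reduced to a bit-piece flag (the types here are simplified stand-ins, not the real DIExpression/DbgVariable):

#include <cassert>
#include <vector>

struct Expr { bool IsBitPiece; };          // stand-in for DIExpression

struct Var {
  std::vector<Expr> Exprs;                 // one expression per MMI entry
  std::vector<int> FrameIdx;               // matching frame indices

  // Mirrors the shape of DbgVariable::addMMIEntry().
  void addMMIEntry(const Var &Other) {
    if (Other.FrameIdx.back() != ~0) {
      Exprs.insert(Exprs.end(), Other.Exprs.begin(), Other.Exprs.end());
      FrameIdx.insert(FrameIdx.end(), Other.FrameIdx.begin(),
                      Other.FrameIdx.end());
    }
    // Multiple entries only make sense when every entry is a bit piece.
    if (Exprs.size() > 1)
      for (const Expr &E : Exprs)
        assert(E.IsBitPiece && "conflicting locations for variable");
  }
};
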
diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
new file mode 100644
index 0000000..fcab067
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
@@ -0,0 +1,269 @@
+//===-- llvm/CodeGen/DwarfExpression.cpp - Dwarf Debug Framework ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf debug info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfExpression.h"
+#include "DwarfDebug.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+const TargetRegisterInfo *DwarfExpression::getTRI() const {
+  return AP.TM.getSubtargetImpl()->getRegisterInfo();
+}
+
+unsigned DwarfExpression::getDwarfVersion() const {
+  return AP.getDwarfDebug()->getDwarfVersion();
+}
+
+void DwarfExpression::AddReg(int DwarfReg, const char *Comment) {
+  assert(DwarfReg >= 0 && "invalid negative dwarf register number");
+  if (DwarfReg < 32) {
+    EmitOp(dwarf::DW_OP_reg0 + DwarfReg, Comment);
+  } else {
+    EmitOp(dwarf::DW_OP_regx, Comment);
+    EmitUnsigned(DwarfReg);
+  }
+}
+
+void DwarfExpression::AddRegIndirect(int DwarfReg, int Offset, bool Deref) {
+  assert(DwarfReg >= 0 && "invalid negative dwarf register number");
+  if (DwarfReg < 32) {
+    EmitOp(dwarf::DW_OP_breg0 + DwarfReg);
+  } else {
+    EmitOp(dwarf::DW_OP_bregx);
+    EmitUnsigned(DwarfReg);
+  }
+  EmitSigned(Offset);
+  if (Deref)
+    EmitOp(dwarf::DW_OP_deref);
+}
+
+void DwarfExpression::AddOpPiece(unsigned SizeInBits, unsigned OffsetInBits) {
+  assert(SizeInBits > 0 && "piece has size zero");
+  const unsigned SizeOfByte = 8;
+  if (OffsetInBits > 0 || SizeInBits % SizeOfByte) {
+    EmitOp(dwarf::DW_OP_bit_piece);
+    EmitUnsigned(SizeInBits);
+    EmitUnsigned(OffsetInBits);
+  } else {
+    EmitOp(dwarf::DW_OP_piece);
+    unsigned ByteSize = SizeInBits / SizeOfByte;
+    EmitUnsigned(ByteSize);
+  }
+}
+
+void DwarfExpression::AddShr(unsigned ShiftBy) {
+  EmitOp(dwarf::DW_OP_constu);
+  EmitUnsigned(ShiftBy);
+  EmitOp(dwarf::DW_OP_shr);
+}
+
+bool DwarfExpression::AddMachineRegIndirect(unsigned MachineReg, int Offset) {
+  int DwarfReg = getTRI()->getDwarfRegNum(MachineReg, false);
+  if (DwarfReg < 0)
+    return false;
+
+  if (isFrameRegister(MachineReg)) {
+    // If variable offset is based in frame register then use fbreg.
+    EmitOp(dwarf::DW_OP_fbreg);
+    EmitSigned(Offset);
+  } else {
+    AddRegIndirect(DwarfReg, Offset);
+  }
+  return true;
+}
+
+bool DwarfExpression::AddMachineRegPiece(unsigned MachineReg,
+                                         unsigned PieceSizeInBits,
+                                         unsigned PieceOffsetInBits) {
+  const TargetRegisterInfo *TRI = getTRI();
+  if (!TRI->isPhysicalRegister(MachineReg))
+    return false;
+
+  int Reg = TRI->getDwarfRegNum(MachineReg, false);
+
+  // If this is a valid register number, emit it.
+  if (Reg >= 0) {
+    AddReg(Reg);
+    if (PieceSizeInBits)
+      AddOpPiece(PieceSizeInBits, PieceOffsetInBits);
+    return true;
+  }
+
+  // Walk up the super-register chain until we find a valid number.
+  // For example, EAX on x86_64 is a 32-bit piece of RAX with offset 0.
+  for (MCSuperRegIterator SR(MachineReg, TRI); SR.isValid(); ++SR) {
+    Reg = TRI->getDwarfRegNum(*SR, false);
+    if (Reg >= 0) {
+      unsigned Idx = TRI->getSubRegIndex(*SR, MachineReg);
+      unsigned Size = TRI->getSubRegIdxSize(Idx);
+      unsigned RegOffset = TRI->getSubRegIdxOffset(Idx);
+      AddReg(Reg, "super-register");
+      if (PieceOffsetInBits == RegOffset) {
+        AddOpPiece(Size, RegOffset);
+      } else {
+        // If this is part of a variable in a sub-register at a
+        // non-zero offset, we need to manually shift the value into
+        // place, since the DW_OP_piece describes the part of the
+        // variable, not the position of the subregister.
+        if (RegOffset)
+          AddShr(RegOffset);
+        AddOpPiece(Size, PieceOffsetInBits);
+      }
+      return true;
+    }
+  }
+
+  // Otherwise, attempt to find a covering set of sub-register numbers.
+  // For example, Q0 on ARM is a composition of D0+D1.
+  //
+  // Keep track of the current position so we can emit the more
+  // efficient DW_OP_piece.
+  unsigned CurPos = PieceOffsetInBits;
+  // The size of the register in bits, assuming 8 bits per byte.
+  unsigned RegSize = TRI->getMinimalPhysRegClass(MachineReg)->getSize() * 8;
+  // Keep track of the bits in the register we already emitted, so we
+  // can avoid emitting redundant aliasing subregs.
+  SmallBitVector Coverage(RegSize, false);
+  for (MCSubRegIterator SR(MachineReg, TRI); SR.isValid(); ++SR) {
+    unsigned Idx = TRI->getSubRegIndex(MachineReg, *SR);
+    unsigned Size = TRI->getSubRegIdxSize(Idx);
+    unsigned Offset = TRI->getSubRegIdxOffset(Idx);
+    Reg = TRI->getDwarfRegNum(*SR, false);
+
+    // Intersection between the bits we already emitted and the bits
+    // covered by this subregister.
+    SmallBitVector Intersection(RegSize, false);
+    Intersection.set(Offset, Offset + Size);
+    Intersection ^= Coverage;
+
+    // If this sub-register has a DWARF number and we haven't covered
+    // its range, emit a DWARF piece for it.
+    if (Reg >= 0 && Intersection.any()) {
+      AddReg(Reg, "sub-register");
+      AddOpPiece(Size, Offset == CurPos ? 0 : Offset);
+      CurPos = Offset + Size;
+
+      // Mark it as emitted.
+      Coverage.set(Offset, Offset + Size);
+    }
+  }
+
+  return CurPos > PieceOffsetInBits;
+}
+
+void DwarfExpression::AddSignedConstant(int Value) {
+  EmitOp(dwarf::DW_OP_consts);
+  EmitSigned(Value);
+  // The proper way to describe a constant value is
+  // DW_OP_constu <const>, DW_OP_stack_value.
+  // Unfortunately, DW_OP_stack_value was not available until DWARF-4,
+  // so we will continue to generate DW_OP_constu <const> for DWARF-2
+  // and DWARF-3. Technically, this is incorrect since DW_OP_const <const>
+  // actually describes a value at a constant address, not a constant value.
+  // However, in the past there was no better way to describe a constant
+  // value, so the producers and consumers started to rely on heuristics
+  // to disambiguate the value vs. location status of the expression.
+  // See PR21176 for more details.
+  if (getDwarfVersion() >= 4)
+    EmitOp(dwarf::DW_OP_stack_value);
+}
+
+void DwarfExpression::AddUnsignedConstant(unsigned Value) {
+  EmitOp(dwarf::DW_OP_constu);
+  EmitUnsigned(Value);
+  // cf. comment in DwarfExpression::AddSignedConstant().
+  if (getDwarfVersion() >= 4)
+    EmitOp(dwarf::DW_OP_stack_value);
+}
+
+static unsigned getOffsetOrZero(unsigned OffsetInBits,
+                                unsigned PieceOffsetInBits) {
+  if (OffsetInBits == PieceOffsetInBits)
+    return 0;
+  assert(OffsetInBits >= PieceOffsetInBits && "overlapping pieces");
+  return OffsetInBits;
+}
+
+bool DwarfExpression::AddMachineRegExpression(DIExpression Expr,
+                                              unsigned MachineReg,
+                                              unsigned PieceOffsetInBits) {
+  auto I = Expr.begin();
+  // Pattern-match combinations for which more efficient representations exist
+  // first.
+  if (I == Expr.end())
+    return AddMachineRegPiece(MachineReg);
+
+  bool ValidReg = false;
+  switch (*I) {
+  case dwarf::DW_OP_bit_piece: {
+    unsigned OffsetInBits = I->getArg(1);
+    unsigned SizeInBits   = I->getArg(2);
+    // Piece always comes at the end of the expression.
+    return AddMachineRegPiece(MachineReg, SizeInBits,
+               getOffsetOrZero(OffsetInBits, PieceOffsetInBits));
+  }
+  case dwarf::DW_OP_plus:
+    // [DW_OP_reg,Offset,DW_OP_plus,DW_OP_deref] --> [DW_OP_breg,Offset].
+    if (I->getNext() == dwarf::DW_OP_deref) {
+      unsigned Offset = I->getArg(1);
+      ValidReg = AddMachineRegIndirect(MachineReg, Offset);
+      std::advance(I, 2);
+      break;
+    } else
+      ValidReg = AddMachineRegPiece(MachineReg);
+  case dwarf::DW_OP_deref:
+    // [DW_OP_reg,DW_OP_deref] --> [DW_OP_breg].
+    ValidReg = AddMachineRegIndirect(MachineReg);
+    ++I;
+    break;
+  default:
+    llvm_unreachable("unsupported operand");
+  }
+
+  if (!ValidReg)
+    return false;
+
+  // Emit remaining elements of the expression.
+  AddExpression(I, Expr.end(), PieceOffsetInBits);
+  return true;
+}
+
+void DwarfExpression::AddExpression(DIExpression::iterator I,
+                                    DIExpression::iterator E,
+                                    unsigned PieceOffsetInBits) {
+  for (; I != E; ++I) {
+    switch (*I) {
+    case dwarf::DW_OP_bit_piece: {
+      unsigned OffsetInBits = I->getArg(1);
+      unsigned SizeInBits   = I->getArg(2);
+      AddOpPiece(SizeInBits, getOffsetOrZero(OffsetInBits, PieceOffsetInBits));
+      break;
+    }
+    case dwarf::DW_OP_plus:
+      EmitOp(dwarf::DW_OP_plus_uconst);
+      EmitUnsigned(I->getArg(1));
+      break;
+    case dwarf::DW_OP_deref:
+      EmitOp(dwarf::DW_OP_deref);
+      break;
+    default:
+      llvm_unreachable("unhandled opcode found in DIExpression");
+    }
+  }
+}
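
AddMachineRegPiece() above handles registers that lack a DWARF number of their own by composing pieces of their sub-registers, skipping bits that aliasing sub-registers have already described. A rough standalone sketch of that covering-set idea (SubReg and the printf stand in for the real TRI queries and AddReg/AddOpPiece calls):

#include <cstdio>
#include <vector>

// Hypothetical description of one sub-register: its DWARF number (or -1) and
// the bit range it occupies inside the super-register.
struct SubReg { int DwarfNum; unsigned Offset, Size; };

// A register with no DWARF number (e.g. Q0 on ARM == D0+D1) is described by
// one piece per sub-register that contributes bits not yet covered, so
// aliasing sub-registers are emitted only once.
bool emitCoveringPieces(const std::vector<SubReg> &Subs, unsigned RegSizeInBits) {
  std::vector<bool> Coverage(RegSizeInBits, false);
  unsigned CurPos = 0;
  for (const SubReg &S : Subs) {
    if (S.DwarfNum < 0)
      continue;                                 // no DWARF encoding for this one
    bool AddsBits = false;
    for (unsigned i = S.Offset; i < S.Offset + S.Size; ++i)
      AddsBits = AddsBits || !Coverage[i];
    if (!AddsBits)
      continue;                                 // alias, bits already described
    // Stands in for AddReg(S.DwarfNum) followed by AddOpPiece(S.Size, ...).
    std::printf("reg %d piece size=%u offset=%u\n", S.DwarfNum, S.Size,
                S.Offset == CurPos ? 0u : S.Offset);
    for (unsigned i = S.Offset; i < S.Offset + S.Size; ++i)
      Coverage[i] = true;
    CurPos = S.Offset + S.Size;
  }
  return CurPos > 0;                            // emitted at least one piece
}
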
diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.h b/lib/CodeGen/AsmPrinter/DwarfExpression.h
new file mode 100644
index 0000000..b90b7b6
--- /dev/null
+++ b/lib/CodeGen/AsmPrinter/DwarfExpression.h
@@ -0,0 +1,133 @@
+//===-- llvm/CodeGen/DwarfExpression.h - Dwarf Compile Unit ---*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for constructing DWARF expressions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFEXPRESSION_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFEXPRESSION_H
+
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class AsmPrinter;
+class ByteStreamer;
+class TargetRegisterInfo;
+class DwarfUnit;
+class DIELoc;
+
+/// Base class containing the logic for constructing DWARF expressions
+/// independently of whether they are emitted into a DIE or into a .debug_loc
+/// entry.
+class DwarfExpression {
+protected:
+  const AsmPrinter &AP;
+  // Various convenience accessors that extract things out of AsmPrinter.
+  const TargetRegisterInfo *getTRI() const;
+  unsigned getDwarfVersion() const;
+
+public:
+  DwarfExpression(const AsmPrinter &AP) : AP(AP) {}
+  virtual ~DwarfExpression() {}
+
+  /// Output a dwarf operand and an optional assembler comment.
+  virtual void EmitOp(uint8_t Op, const char *Comment = nullptr) = 0;
+  /// Emit a raw signed value.
+  virtual void EmitSigned(int Value) = 0;
+  /// Emit a raw unsigned value.
+  virtual void EmitUnsigned(unsigned Value) = 0;
+  /// Return whether the given machine register is the frame register in the
+  /// current function.
+  virtual bool isFrameRegister(unsigned MachineReg) = 0;
+
+  /// Emit a dwarf register operation.
+  void AddReg(int DwarfReg, const char *Comment = nullptr);
+  /// Emit a (double-)indirect dwarf register operation.
+  void AddRegIndirect(int DwarfReg, int Offset, bool Deref = false);
+
+  /// Emit a dwarf register operation for describing
+  /// - a small value occupying only part of a register or
+  /// - a register representing only part of a value.
+  void AddOpPiece(unsigned SizeInBits, unsigned OffsetInBits = 0);
+  /// Emit a shift-right dwarf expression.
+  void AddShr(unsigned ShiftBy);
+
+  /// Emit an indirect dwarf register operation for the given machine register.
+  /// \return false if no DWARF register exists for MachineReg.
+  bool AddMachineRegIndirect(unsigned MachineReg, int Offset = 0);
+
+  /// \brief Emit a partial DWARF register operation.
+  /// \param MachineReg        the register
+  /// \param PieceSizeInBits   size and
+  /// \param PieceOffsetInBits offset of the piece in bits, if this is one
+  ///                          piece of an aggregate value.
+  ///
+  /// If size and offset are zero, an operation for the entire
+  /// register is emitted. Some targets do not provide a DWARF
+  /// register number for every register.  If this is the case, this
+  /// function will attempt to emit a DWARF register by emitting a
+  /// piece of a super-register or by piecing together multiple
+  /// subregisters that alias the register.
+  ///
+  /// \return false if no DWARF register exists for MachineReg.
+  bool AddMachineRegPiece(unsigned MachineReg, unsigned PieceSizeInBits = 0,
+                          unsigned PieceOffsetInBits = 0);
+
+  /// Emit a signed constant.
+  void AddSignedConstant(int Value);
+  /// Emit an unsigned constant.
+  void AddUnsignedConstant(unsigned Value);
+
+  /// Emit an entire DIExpression on top of a machine register location.
+  /// \param PieceOffsetInBits If this is one piece out of a fragmented
+  /// location, this is the offset of the piece inside the entire variable.
+  /// \return false if no DWARF register exists for MachineReg.
+  bool AddMachineRegExpression(DIExpression Expr, unsigned MachineReg,
+                               unsigned PieceOffsetInBits = 0);
+  /// Emit the remaining operations in the DIExpression, starting at iterator I.
+  /// \param PieceOffsetInBits If this is one piece out of a fragmented
+  /// location, this is the offset of the piece inside the entire variable.
+  void AddExpression(DIExpression::iterator I, DIExpression::iterator E,
+                     unsigned PieceOffsetInBits = 0);
+};
+
+/// DwarfExpression implementation for .debug_loc entries.
+class DebugLocDwarfExpression : public DwarfExpression {
+  ByteStreamer &BS;
+
+public:
+  DebugLocDwarfExpression(const AsmPrinter &AP, ByteStreamer &BS)
+      : DwarfExpression(AP), BS(BS) {}
+
+  void EmitOp(uint8_t Op, const char *Comment = nullptr) override;
+  void EmitSigned(int Value) override;
+  void EmitUnsigned(unsigned Value) override;
+  bool isFrameRegister(unsigned MachineReg) override;
+};
+
+/// DwarfExpression implementation for singular DW_AT_location.
+class DIEDwarfExpression : public DwarfExpression {
+  DwarfUnit &DU;
+  DIELoc &DIE;
+
+public:
+  DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU, DIELoc &DIE)
+      : DwarfExpression(AP), DU(DU), DIE(DIE) {}
+
+  void EmitOp(uint8_t Op, const char *Comment = nullptr) override;
+  void EmitSigned(int Value) override;
+  void EmitUnsigned(unsigned Value) override;
+  bool isFrameRegister(unsigned MachineReg) override;
+};
+}
+
+#endif
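
The new header separates expression construction from the output target through the EmitOp/EmitSigned/EmitUnsigned hooks, so the same building logic can feed either a DIE or a .debug_loc stream. A compilable toy version of that split, with the DWARF opcode values written inline and stdout standing in for a real output channel:

#include <cstdint>
#include <iostream>

// Base class knows how to compose operator sequences; subclasses decide where
// the bytes go (here: printed, purely for illustration).
class ExprBuilder {
public:
  virtual ~ExprBuilder() = default;
  virtual void EmitOp(uint8_t Op) = 0;
  virtual void EmitUnsigned(unsigned Value) = 0;

  // Same shape as DwarfExpression::AddReg(): pick DW_OP_reg0+N or DW_OP_regx.
  void AddReg(int DwarfReg) {
    const uint8_t DW_OP_reg0 = 0x50, DW_OP_regx = 0x90;
    if (DwarfReg < 32)
      EmitOp(DW_OP_reg0 + DwarfReg);
    else {
      EmitOp(DW_OP_regx);
      EmitUnsigned(DwarfReg);
    }
  }
};

class PrintingExprBuilder : public ExprBuilder {
public:
  void EmitOp(uint8_t Op) override {
    std::cout << "op 0x" << std::hex << +Op << '\n';
  }
  void EmitUnsigned(unsigned Value) override {
    std::cout << "uleb " << std::dec << Value << '\n';
  }
};

int main() {
  PrintingExprBuilder B;
  B.AddReg(3);   // small register number -> single DW_OP_reg3 opcode
  B.AddReg(40);  // large register number -> DW_OP_regx plus ULEB operand
}
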
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/lib/CodeGen/AsmPrinter/DwarfFile.cpp
index 50180ea..3988f0d 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.cpp
@@ -8,13 +8,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "DwarfFile.h"
-
 #include "DwarfDebug.h"
 #include "DwarfUnit.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/Support/LEB128.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 
 namespace llvm {
@@ -147,7 +146,7 @@
   StrPool.emit(*Asm, StrSection, OffsetSection);
 }
 
-void DwarfFile::addScopeVariable(LexicalScope *LS, DbgVariable *Var) {
+bool DwarfFile::addScopeVariable(LexicalScope *LS, DbgVariable *Var) {
   SmallVectorImpl<DbgVariable *> &Vars = ScopeVariables[LS];
   DIVariable DV = Var->getVariable();
   // Variables with positive arg numbers are parameters.
@@ -169,18 +168,17 @@
       // A later indexed parameter has been found, insert immediately before it.
       if (CurNum > ArgNum)
         break;
-      // FIXME: There are still some cases where two inlined functions are
-      // conflated together (two calls to the same function at the same
-      // location (eg: via a macro, or without column info, etc)) and then
-      // their arguments are conflated as well.
-      assert((LS->getParent() || CurNum != ArgNum) &&
-             "Duplicate argument for top level (non-inlined) function");
+      if (CurNum == ArgNum) {
+        (*I)->addMMIEntry(*Var);
+        return false;
+      }
       ++I;
     }
     Vars.insert(I, Var);
-    return;
+    return true;
   }
 
   Vars.push_back(Var);
+  return true;
 }
 }
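
addScopeVariable() now reports whether the variable was kept or merged into an existing parameter entry, instead of asserting on duplicates. A standalone sketch of the sorted-insert-or-merge behavior (Param and the += merge are simplified stand-ins for DbgVariable and addMMIEntry):

#include <vector>

struct Param { unsigned ArgNum; int Data; };

// Parameters stay sorted by argument number; a second entry for the same
// argument merges into the existing one. Returns false when the caller's
// entry was merged, so the caller knows not to keep it as a separate variable.
bool addParam(std::vector<Param> &Vars, const Param &P) {
  auto I = Vars.begin();
  while (I != Vars.end() && I->ArgNum <= P.ArgNum) {
    if (I->ArgNum == P.ArgNum) {
      I->Data += P.Data;          // stands in for (*I)->addMMIEntry(*Var)
      return false;               // merged with a previous entry
    }
    ++I;
  }
  Vars.insert(I, P);
  return true;
}
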
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h
index 9d64bfc..35bf33a 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -10,17 +10,16 @@
 #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFFILE_H
 #define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFFILE_H
 
+#include "AddressPool.h"
+#include "DwarfStringPool.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/Support/Allocator.h"
-#include "AddressPool.h"
-#include "DwarfStringPool.h"
-
-#include <vector>
-#include <string>
 #include <memory>
+#include <string>
+#include <vector>
 
 namespace llvm {
 class AsmPrinter;
@@ -96,7 +95,8 @@
   /// \brief Returns the string pool.
   DwarfStringPool &getStringPool() { return StrPool; }
 
-  void addScopeVariable(LexicalScope *LS, DbgVariable *Var);
+  /// \returns false if the variable was merged with a previous one.
+  bool addScopeVariable(LexicalScope *LS, DbgVariable *Var);
 
   DenseMap<LexicalScope *, SmallVector<DbgVariable *, 8>> &getScopeVariables() {
     return ScopeVariables;
diff --git a/lib/CodeGen/AsmPrinter/DwarfStringPool.h b/lib/CodeGen/AsmPrinter/DwarfStringPool.h
index ab32c1b..63e3412 100644
--- a/lib/CodeGen/AsmPrinter/DwarfStringPool.h
+++ b/lib/CodeGen/AsmPrinter/DwarfStringPool.h
@@ -13,7 +13,6 @@
 #include "llvm/ADT/StringMap.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/Support/Allocator.h"
-
 #include <utility>
 
 namespace llvm {
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 919d9d2..b0c7d48 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -12,10 +12,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "DwarfUnit.h"
-
 #include "DwarfAccelTable.h"
 #include "DwarfCompileUnit.h"
 #include "DwarfDebug.h"
+#include "DwarfExpression.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DIBuilder.h"
@@ -43,6 +43,20 @@
                        cl::desc("Generate DWARF4 type units."),
                        cl::init(false));
 
+void DIEDwarfExpression::EmitOp(uint8_t Op, const char* Comment) {
+  DU.addUInt(DIE, dwarf::DW_FORM_data1, Op);
+}
+void DIEDwarfExpression::EmitSigned(int Value) {
+  DU.addSInt(DIE, dwarf::DW_FORM_sdata, Value);
+}
+void DIEDwarfExpression::EmitUnsigned(unsigned Value) {
+  DU.addUInt(DIE, dwarf::DW_FORM_udata, Value);
+}
+bool DIEDwarfExpression::isFrameRegister(unsigned MachineReg) {
+  return MachineReg == getTRI()->getFrameRegister(*AP.MF);
+}
+
+
 /// Unit - Unit constructor.
 DwarfUnit::DwarfUnit(unsigned UID, dwarf::Tag UnitTag, DICompileUnit Node,
                      AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU)
@@ -116,6 +130,30 @@
     if (dwarf::DWARF_VERSION >= 4)
       return 1;
     break;
+
+  // The languages below have valid values only if the DWARF version >= 5.
+  case dwarf::DW_LANG_OpenCL:
+  case dwarf::DW_LANG_Go:
+  case dwarf::DW_LANG_Haskell:
+  case dwarf::DW_LANG_C_plus_plus_03:
+  case dwarf::DW_LANG_C_plus_plus_11:
+  case dwarf::DW_LANG_OCaml:
+  case dwarf::DW_LANG_Rust:
+  case dwarf::DW_LANG_C11:
+  case dwarf::DW_LANG_Swift:
+  case dwarf::DW_LANG_Dylan:
+  case dwarf::DW_LANG_C_plus_plus_14:
+    if (dwarf::DWARF_VERSION >= 5)
+      return 0;
+    break;
+
+  case dwarf::DW_LANG_Modula3:
+  case dwarf::DW_LANG_Julia:
+  case dwarf::DW_LANG_Fortran03:
+  case dwarf::DW_LANG_Fortran08:
+    if (dwarf::DWARF_VERSION >= 5)
+      return 1;
+    break;
   }
 
   return -1;
@@ -399,85 +437,18 @@
 }
 
 /// addRegisterOp - Add register operand.
-// FIXME: Ideally, this would share the implementation with
-// AsmPrinter::EmitDwarfRegOpPiece.
-void DwarfUnit::addRegisterOpPiece(DIELoc &TheDie, unsigned Reg,
+bool DwarfUnit::addRegisterOpPiece(DIELoc &TheDie, unsigned Reg,
                                    unsigned SizeInBits, unsigned OffsetInBits) {
-  const TargetRegisterInfo *RI = Asm->TM.getSubtargetImpl()->getRegisterInfo();
-  int DWReg = RI->getDwarfRegNum(Reg, false);
-  bool isSubRegister = DWReg < 0;
-
-  unsigned Idx = 0;
-
-  // Go up the super-register chain until we hit a valid dwarf register number.
-  for (MCSuperRegIterator SR(Reg, RI); SR.isValid() && DWReg < 0; ++SR) {
-    DWReg = RI->getDwarfRegNum(*SR, false);
-    if (DWReg >= 0)
-      Idx = RI->getSubRegIndex(*SR, Reg);
-  }
-
-  if (DWReg < 0) {
-    DEBUG(dbgs() << "Invalid Dwarf register number.\n");
-    addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_nop);
-    return;
-  }
-
-  // Emit register.
-  if (DWReg < 32)
-    addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + DWReg);
-  else {
-    addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_regx);
-    addUInt(TheDie, dwarf::DW_FORM_udata, DWReg);
-  }
-
-  // Emit mask.
-  bool isPiece = SizeInBits > 0;
-  if (isSubRegister || isPiece) {
-    const unsigned SizeOfByte = 8;
-    unsigned RegSizeInBits = RI->getSubRegIdxSize(Idx);
-    unsigned RegOffsetInBits = RI->getSubRegIdxOffset(Idx);
-    unsigned PieceSizeInBits = std::max(SizeInBits, RegSizeInBits);
-    unsigned PieceOffsetInBits = OffsetInBits ? OffsetInBits : RegOffsetInBits;
-    assert(RegSizeInBits >= SizeInBits && "register smaller than value");
-
-    if (RegOffsetInBits != PieceOffsetInBits) {
-      // Manually shift the value into place, since the DW_OP_piece
-      // describes the part of the variable, not the position of the
-      // subregister.
-      addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
-      addUInt(TheDie, dwarf::DW_FORM_data1, RegOffsetInBits);
-      addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_shr);
-    }
-
-    if (PieceOffsetInBits > 0 || PieceSizeInBits % SizeOfByte) {
-      assert(PieceSizeInBits > 0 && "piece has zero size");
-      addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_bit_piece);
-      addUInt(TheDie, dwarf::DW_FORM_data1, PieceSizeInBits);
-      addUInt(TheDie, dwarf::DW_FORM_data1, PieceOffsetInBits);
-     } else {
-      assert(PieceSizeInBits > 0 && "piece has zero size");
-      addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_piece);
-      addUInt(TheDie, dwarf::DW_FORM_data1, PieceSizeInBits/SizeOfByte);
-    }
-  }
+  DIEDwarfExpression Expr(*Asm, *this, TheDie);
+  Expr.AddMachineRegPiece(Reg, SizeInBits, OffsetInBits);
+  return true;
 }
 
 /// addRegisterOffset - Add register offset.
-void DwarfUnit::addRegisterOffset(DIELoc &TheDie, unsigned Reg,
+bool DwarfUnit::addRegisterOffset(DIELoc &TheDie, unsigned Reg,
                                   int64_t Offset) {
-  const TargetRegisterInfo *RI = Asm->TM.getSubtargetImpl()->getRegisterInfo();
-  unsigned DWReg = RI->getDwarfRegNum(Reg, false);
-  const TargetRegisterInfo *TRI = Asm->TM.getSubtargetImpl()->getRegisterInfo();
-  if (Reg == TRI->getFrameRegister(*Asm->MF))
-    // If variable offset is based in frame register then use fbreg.
-    addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_fbreg);
-  else if (DWReg < 32)
-    addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + DWReg);
-  else {
-    addUInt(TheDie, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx);
-    addUInt(TheDie, dwarf::DW_FORM_udata, DWReg);
-  }
-  addSInt(TheDie, dwarf::DW_FORM_sdata, Offset);
+  DIEDwarfExpression Expr(*Asm, *this, TheDie);
+  return Expr.AddMachineRegIndirect(Reg, Offset);
 }
 
 /* Byref variables, in Blocks, are declared by the programmer as "SomeType
@@ -581,10 +552,14 @@
   // variable's location.
   DIELoc *Loc = new (DIEValueAllocator) DIELoc();
 
+  bool validReg;
   if (Location.isReg())
-    addRegisterOpPiece(*Loc, Location.getReg());
+    validReg = addRegisterOpPiece(*Loc, Location.getReg());
   else
-    addRegisterOffset(*Loc, Location.getReg(), Location.getOffset());
+    validReg = addRegisterOffset(*Loc, Location.getReg(), Location.getOffset());
+
+  if (!validReg)
+    return;
 
   // If we started with a pointer to the __Block_byref... struct, then
   // the first thing we need to do is dereference the pointer (DW_OP_deref).
@@ -622,13 +597,19 @@
     dwarf::Tag T = (dwarf::Tag)Ty.getTag();
     // Encode pointer constants as unsigned bytes. This is used at least for
     // null pointer constant emission.
+    // (Pieces of) aggregate types that get hacked apart by SROA may also be
+    // represented by a constant. Encode them as unsigned bytes.
     // FIXME: reference and rvalue_reference /probably/ shouldn't be allowed
     // here, but accept them for now due to a bug in SROA producing bogus
     // dbg.values.
-    if (T == dwarf::DW_TAG_pointer_type ||
+    if (T == dwarf::DW_TAG_array_type ||
+        T == dwarf::DW_TAG_class_type ||
+        T == dwarf::DW_TAG_pointer_type ||
         T == dwarf::DW_TAG_ptr_to_member_type ||
         T == dwarf::DW_TAG_reference_type ||
-        T == dwarf::DW_TAG_rvalue_reference_type)
+        T == dwarf::DW_TAG_rvalue_reference_type ||
+        T == dwarf::DW_TAG_structure_type ||
+        T == dwarf::DW_TAG_union_type)
       return true;
     assert(T == dwarf::DW_TAG_typedef || T == dwarf::DW_TAG_const_type ||
            T == dwarf::DW_TAG_volatile_type ||
@@ -649,11 +630,15 @@
           Encoding == dwarf::DW_ATE_unsigned_char ||
           Encoding == dwarf::DW_ATE_signed ||
           Encoding == dwarf::DW_ATE_signed_char ||
-          Encoding == dwarf::DW_ATE_UTF || Encoding == dwarf::DW_ATE_boolean) &&
+          Encoding == dwarf::DW_ATE_float ||
+          Encoding == dwarf::DW_ATE_UTF || Encoding == dwarf::DW_ATE_boolean ||
+          (Ty.getTag() == dwarf::DW_TAG_unspecified_type &&
+           Ty.getName() == "decltype(nullptr)")) &&
          "Unsupported encoding");
   return (Encoding == dwarf::DW_ATE_unsigned ||
           Encoding == dwarf::DW_ATE_unsigned_char ||
-          Encoding == dwarf::DW_ATE_UTF || Encoding == dwarf::DW_ATE_boolean);
+          Encoding == dwarf::DW_ATE_UTF || Encoding == dwarf::DW_ATE_boolean ||
+          Ty.getTag() == dwarf::DW_TAG_unspecified_type);
 }
 
 /// If this type is derived from a base type then return base type size.
@@ -667,10 +652,7 @@
 
   DIType BaseType = DD->resolve(Ty.getTypeDerivedFrom());
 
-  // If this type is not derived from any type or the type is a declaration then
-  // take conservative approach.
-  if (!BaseType.isValid() || BaseType.isForwardDecl())
-    return Ty.getSizeInBits();
+  assert(BaseType.isValid() && "Unexpected invalid base type");
 
   // If this is a derived type, go ahead and get the base type, unless it's a
   // reference then it's just the size of the field. Pointer types have no need
@@ -977,7 +959,8 @@
     addString(Buffer, dwarf::DW_AT_name, Name);
 
   // Add size if non-zero (derived types might be zero-sized.)
-  if (Size && Tag != dwarf::DW_TAG_pointer_type)
+  if (Size && Tag != dwarf::DW_TAG_pointer_type
+           && Tag != dwarf::DW_TAG_ptr_to_member_type)
     addUInt(Buffer, dwarf::DW_AT_byte_size, None, Size);
 
   if (Tag == dwarf::DW_TAG_ptr_to_member_type)
@@ -1110,6 +1093,8 @@
     if (CTy.isAppleBlockExtension())
       addFlag(Buffer, dwarf::DW_AT_APPLE_block);
 
+    // This is outside the DWARF spec, but GDB expects a DW_AT_containing_type
+    // inside C++ composite types to point to the base class with the vtable.
     DICompositeType ContainingType(resolve(CTy.getContainingType()));
     if (ContainingType)
       addDIEEntry(Buffer, dwarf::DW_AT_containing_type,
@@ -1187,10 +1172,10 @@
     addType(ParamDIE, resolve(VP.getType()));
   if (!VP.getName().empty())
     addString(ParamDIE, dwarf::DW_AT_name, VP.getName());
-  if (Value *Val = VP.getValue()) {
-    if (ConstantInt *CI = dyn_cast<ConstantInt>(Val))
+  if (Metadata *Val = VP.getValue()) {
+    if (ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(Val))
       addConstantValue(ParamDIE, CI, resolve(VP.getType()));
-    else if (GlobalValue *GV = dyn_cast<GlobalValue>(Val)) {
+    else if (GlobalValue *GV = mdconst::dyn_extract<GlobalValue>(Val)) {
       // For declaration non-type template parameters (such as global values and
       // functions)
       DIELoc *Loc = new (DIEValueAllocator) DIELoc();
@@ -1359,7 +1344,7 @@
   if (SP.isOptimized())
     addFlag(SPDie, dwarf::DW_AT_APPLE_optimized);
 
-  if (unsigned isa = Asm->getISAEncoding()) {
+  if (unsigned isa = Asm->getISAEncoding(SP.getFunction())) {
     addUInt(SPDie, dwarf::DW_AT_APPLE_isa, dwarf::DW_FORM_flag, isa);
   }
 
@@ -1511,7 +1496,7 @@
     uint64_t FieldSize = getBaseTypeSize(DD, DT);
     uint64_t OffsetInBytes;
 
-    if (Size != FieldSize) {
+    if (FieldSize && Size != FieldSize) {
       // Handle bitfield, assume bytes are 8 bits.
       addUInt(MemberDie, dwarf::DW_AT_byte_size, None, FieldSize/8);
       addUInt(MemberDie, dwarf::DW_AT_bit_size, None, Size);
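
The enlarged language switch earlier in this file picks a default array lower bound per source language, gated on the DWARF version that first defines the language code. A sketch of that shape follows; the enum values and version guards are illustrative, not the real dwarf::DW_LANG_* constants:

// -1 means "no known default": the producer must emit DW_AT_lower_bound.
enum class Lang { C99, CPlusPlus11, Fortran08, Julia, Other };

int defaultLowerBound(Lang L, unsigned DwarfVersion) {
  switch (L) {
  case Lang::C99:
    return DwarfVersion >= 3 ? 0 : -1;   // 0-based arrays
  case Lang::CPlusPlus11:
    return DwarfVersion >= 5 ? 0 : -1;   // language code defined in DWARF 5
  case Lang::Fortran08:
  case Lang::Julia:
    return DwarfVersion >= 5 ? 1 : -1;   // 1-based arrays, DWARF 5 codes
  default:
    return -1;                           // be conservative
  }
}
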
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h
index f40c937..7a5e47d 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -14,17 +14,17 @@
 #ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFUNIT_H
 #define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFUNIT_H
 
-#include "DIE.h"
 #include "DwarfDebug.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DIE.h"
 #include "llvm/IR/DIBuilder.h"
 #include "llvm/IR/DebugInfo.h"
+#include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCSection.h"
-#include "llvm/MC/MCDwarf.h"
 
 namespace llvm {
 
@@ -138,6 +138,7 @@
   }
 
   // Accessors.
+  AsmPrinter* getAsmPrinter() const { return Asm; }
   unsigned getUniqueID() const { return UniqueID; }
   uint16_t getLanguage() const { return CUNode.getLanguage(); }
   DICompileUnit getCUNode() const { return CUNode; }
@@ -253,12 +254,16 @@
   /// addTemplateParams - Add template parameters in buffer.
   void addTemplateParams(DIE &Buffer, DIArray TParams);
 
-  /// addRegisterOp - Add register operand.
-  void addRegisterOpPiece(DIELoc &TheDie, unsigned Reg,
+  /// \brief Add register operand.
+  /// \returns false if the register does not exist, e.g., because it was never
+  /// materialized.
+  bool addRegisterOpPiece(DIELoc &TheDie, unsigned Reg,
                           unsigned SizeInBits = 0, unsigned OffsetInBits = 0);
 
-  /// addRegisterOffset - Add register offset.
-  void addRegisterOffset(DIELoc &TheDie, unsigned Reg, int64_t Offset);
+  /// \brief Add register offset.
+  /// \returns false if the register does not exist, e.g., because it was never
+  /// materialized.
+  bool addRegisterOffset(DIELoc &TheDie, unsigned Reg, int64_t Offset);
 
   // FIXME: Should be reformulated in terms of addComplexAddress.
   /// addBlockByrefAddress - Start with the address based on the location
diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/lib/CodeGen/AsmPrinter/EHStreamer.cpp
index 2bbffb3..4841814 100644
--- a/lib/CodeGen/AsmPrinter/EHStreamer.cpp
+++ b/lib/CodeGen/AsmPrinter/EHStreamer.cpp
@@ -121,7 +121,8 @@
       for (unsigned J = NumShared, M = TypeIds.size(); J != M; ++J) {
         int TypeID = TypeIds[J];
         assert(-1 - TypeID < (int)FilterOffsets.size() && "Unknown filter id!");
-        int ValueForTypeID = TypeID < 0 ? FilterOffsets[-1 - TypeID] : TypeID;
+        int ValueForTypeID =
+            isFilterEHSelector(TypeID) ? FilterOffsets[-1 - TypeID] : TypeID;
         unsigned SizeTypeID = getSLEB128Size(ValueForTypeID);
 
         int NextAction = SizeAction ? -(SizeAction + SizeTypeID) : 0;
@@ -195,9 +196,22 @@
 /// table.  Entries must be ordered by try-range address.
 void EHStreamer::
 computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
-                     const RangeMapType &PadMap,
                      const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
                      const SmallVectorImpl<unsigned> &FirstActions) {
+  // Invokes and nounwind calls have entries in PadMap (due to being bracketed
+  // by try-range labels when lowered).  Ordinary calls do not, so appropriate
+  // try-ranges for them need to be deduced so we can put them in the LSDA.
+  RangeMapType PadMap;
+  for (unsigned i = 0, N = LandingPads.size(); i != N; ++i) {
+    const LandingPadInfo *LandingPad = LandingPads[i];
+    for (unsigned j = 0, E = LandingPad->BeginLabels.size(); j != E; ++j) {
+      MCSymbol *BeginLabel = LandingPad->BeginLabels[j];
+      assert(!PadMap.count(BeginLabel) && "Duplicate landing pad labels!");
+      PadRange P = { i, j };
+      PadMap[BeginLabel] = P;
+    }
+  }
+
   // The end label of the previous invoke or nounwind try-range.
   MCSymbol *LastLabel = nullptr;
 
@@ -208,6 +222,8 @@
   // Whether the last CallSite entry was for an invoke.
   bool PreviousIsInvoke = false;
 
+  bool IsSJLJ = Asm->MAI->getExceptionHandlingType() == ExceptionHandling::SjLj;
+
   // Visit all instructions in order of address.
   for (const auto &MBB : *Asm->MF) {
     for (const auto &MI : MBB) {
@@ -237,7 +253,7 @@
       // instruction between the previous try-range and this one may throw,
       // create a call-site entry with no landing pad for the region between the
       // try-ranges.
-      if (SawPotentiallyThrowing && Asm->MAI->usesItaniumLSDAForExceptions()) {
+      if (SawPotentiallyThrowing && Asm->MAI->usesCFIForEH()) {
         CallSiteEntry Site = { LastLabel, BeginLabel, nullptr, 0 };
         CallSites.push_back(Site);
         PreviousIsInvoke = false;
@@ -254,14 +270,14 @@
         CallSiteEntry Site = {
           BeginLabel,
           LastLabel,
-          LandingPad->LandingPadLabel,
+          LandingPad,
           FirstActions[P.PadIndex]
         };
 
         // Try to merge with the previous call-site. SJLJ doesn't do this
-        if (PreviousIsInvoke && Asm->MAI->usesItaniumLSDAForExceptions()) {
+        if (PreviousIsInvoke && !IsSJLJ) {
           CallSiteEntry &Prev = CallSites.back();
-          if (Site.PadLabel == Prev.PadLabel && Site.Action == Prev.Action) {
+          if (Site.LPad == Prev.LPad && Site.Action == Prev.Action) {
             // Extend the range of the previous entry.
             Prev.EndLabel = Site.EndLabel;
             continue;
@@ -269,7 +285,7 @@
         }
 
         // Otherwise, create a new call-site.
-        if (Asm->MAI->usesItaniumLSDAForExceptions())
+        if (!IsSJLJ)
           CallSites.push_back(Site);
         else {
           // SjLj EH must maintain the call sites in the order assigned
@@ -287,7 +303,7 @@
   // If some instruction between the previous try-range and the end of the
   // function may throw, create a call-site entry with no landing pad for the
   // region following the try-range.
-  if (SawPotentiallyThrowing && Asm->MAI->usesItaniumLSDAForExceptions()) {
+  if (SawPotentiallyThrowing && !IsSJLJ) {
     CallSiteEntry Site = { LastLabel, nullptr, nullptr, 0 };
     CallSites.push_back(Site);
   }
@@ -338,23 +354,9 @@
   unsigned SizeActions =
     computeActionsTable(LandingPads, Actions, FirstActions);
 
-  // Invokes and nounwind calls have entries in PadMap (due to being bracketed
-  // by try-range labels when lowered).  Ordinary calls do not, so appropriate
-  // try-ranges for them need be deduced when using DWARF exception handling.
-  RangeMapType PadMap;
-  for (unsigned i = 0, N = LandingPads.size(); i != N; ++i) {
-    const LandingPadInfo *LandingPad = LandingPads[i];
-    for (unsigned j = 0, E = LandingPad->BeginLabels.size(); j != E; ++j) {
-      MCSymbol *BeginLabel = LandingPad->BeginLabels[j];
-      assert(!PadMap.count(BeginLabel) && "Duplicate landing pad labels!");
-      PadRange P = { i, j };
-      PadMap[BeginLabel] = P;
-    }
-  }
-
   // Compute the call-site table.
   SmallVector<CallSiteEntry, 64> CallSites;
-  computeCallSiteTable(CallSites, PadMap, LandingPads, FirstActions);
+  computeCallSiteTable(CallSites, LandingPads, FirstActions);
 
   // Final tallies.
 
@@ -519,8 +521,7 @@
       Asm->EmitULEB128(S.Action);
     }
   } else {
-    // DWARF Exception handling
-    assert(Asm->MAI->usesItaniumLSDAForExceptions());
+    // Itanium LSDA exception handling
 
     // The call-site table is a list of all call sites that may throw an
     // exception (including C++ 'throw' statements) in the procedure
@@ -576,15 +577,15 @@
 
       // Offset of the landing pad, counted in 16-byte bundles relative to the
       // @LPStart address.
-      if (!S.PadLabel) {
+      if (!S.LPad) {
         if (VerboseAsm)
           Asm->OutStreamer.AddComment("    has no landing pad");
         Asm->OutStreamer.EmitIntValue(0, 4/*size*/);
       } else {
         if (VerboseAsm)
           Asm->OutStreamer.AddComment(Twine("    jumps to ") +
-                                      S.PadLabel->getName());
-        Asm->EmitLabelDifference(S.PadLabel, EHFuncBeginSym, 4);
+                                      S.LPad->LandingPadLabel->getName());
+        Asm->EmitLabelDifference(S.LPad->LandingPadLabel, EHFuncBeginSym, 4);
       }
 
       // Offset of the first associated action record, relative to the start of
@@ -681,7 +682,7 @@
     unsigned TypeID = *I;
     if (VerboseAsm) {
       --Entry;
-      if (TypeID != 0)
+      if (isFilterEHSelector(TypeID))
         Asm->OutStreamer.AddComment("FilterInfo " + Twine(Entry));
     }
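
The call-site table changes above keep the Itanium-path merging rule: a new call site that shares both landing pad and action chain with the previous entry extends that entry's try-range rather than adding a row. A standalone sketch of the rule (CallSite is a simplified stand-in for the real entry type):

#include <string>
#include <vector>

struct CallSite { std::string Begin, End; const void *LPad; unsigned Action; };

// For Itanium-style LSDAs (not SJLJ), merge adjacent call sites that share
// the same landing pad and action; otherwise append a new entry.
void addCallSite(std::vector<CallSite> &Sites, const CallSite &S, bool IsSJLJ) {
  if (!IsSJLJ && !Sites.empty()) {
    CallSite &Prev = Sites.back();
    if (S.LPad == Prev.LPad && S.Action == Prev.Action) {
      Prev.End = S.End;   // extend the previous try-range
      return;
    }
  }
  Sites.push_back(S);
}
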
 
diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.h b/lib/CodeGen/AsmPrinter/EHStreamer.h
index 7e9549d..9b316ff 100644
--- a/lib/CodeGen/AsmPrinter/EHStreamer.h
+++ b/lib/CodeGen/AsmPrinter/EHStreamer.h
@@ -23,6 +23,8 @@
 class MachineInstr;
 class MachineFunction;
 class AsmPrinter;
+class MCSymbol;
+class MCSymbolRefExpr;
 
 template <typename T>
 class SmallVectorImpl;
@@ -60,11 +62,11 @@
   /// Structure describing an entry in the call-site table.
   struct CallSiteEntry {
     // The 'try-range' is BeginLabel .. EndLabel.
-    MCSymbol *BeginLabel; // zero indicates the start of the function.
-    MCSymbol *EndLabel;   // zero indicates the end of the function.
+    MCSymbol *BeginLabel; // Null indicates the start of the function.
+    MCSymbol *EndLabel;   // Null indicates the end of the function.
 
-    // The landing pad starts at PadLabel.
-    MCSymbol *PadLabel;   // zero indicates that there is no landing pad.
+    // LPad contains the landing pad start labels.
+    const LandingPadInfo *LPad; // Null indicates that there is no landing pad.
     unsigned Action;
   };
 
@@ -86,7 +88,6 @@
   /// form gaps in the table.  Entries must be ordered by try-range address.
 
   void computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
-                            const RangeMapType &PadMap,
                             const SmallVectorImpl<const LandingPadInfo *> &LPs,
                             const SmallVectorImpl<unsigned> &FirstActions);
 
@@ -113,6 +114,13 @@
 
   virtual void emitTypeInfos(unsigned TTypeEncoding);
 
+  // Helpers for identifying what kind of clause an EH typeid or selector
+  // corresponds to. Negative selectors are for filter clauses, the zero
+  // selector is for cleanups, and positive selectors are for catch clauses.
+  static bool isFilterEHSelector(int Selector) { return Selector < 0; }
+  static bool isCleanupEHSelector(int Selector) { return Selector == 0; }
+  static bool isCatchEHSelector(int Selector) { return Selector > 0; }
+
 public:
   EHStreamer(AsmPrinter *A);
   virtual ~EHStreamer();
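
The three selector helpers added above encode the LSDA sign convention directly; a tiny standalone usage example:

#include <cstdio>

static bool isFilterEHSelector(int Selector) { return Selector < 0; }
static bool isCleanupEHSelector(int Selector) { return Selector == 0; }
static bool isCatchEHSelector(int Selector) { return Selector > 0; }

int main() {
  const int Sels[] = {-2, 0, 3};
  for (int Sel : Sels)
    std::printf("%d: filter=%d cleanup=%d catch=%d\n", Sel,
                isFilterEHSelector(Sel), isCleanupEHSelector(Sel),
                isCatchEHSelector(Sel));
}
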
diff --git a/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp b/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
index 5bda5a9..97a3234 100644
--- a/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
@@ -34,35 +34,35 @@
 
 namespace {
 
-  class ErlangGCPrinter : public GCMetadataPrinter {
-  public:
-    void beginAssembly(AsmPrinter &AP) override;
-    void finishAssembly(AsmPrinter &AP) override;
-  };
-
+class ErlangGCPrinter : public GCMetadataPrinter {
+public:
+  void finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) override;
+};
 }
 
 static GCMetadataPrinterRegistry::Add<ErlangGCPrinter>
-X("erlang", "erlang-compatible garbage collector");
+    X("erlang", "erlang-compatible garbage collector");
 
-void llvm::linkErlangGCPrinter() { }
+void llvm::linkErlangGCPrinter() {}
 
-void ErlangGCPrinter::beginAssembly(AsmPrinter &AP) { }
-
-void ErlangGCPrinter::finishAssembly(AsmPrinter &AP) {
+void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
+                                     AsmPrinter &AP) {
   MCStreamer &OS = AP.OutStreamer;
-  unsigned IntPtrSize =
-      AP.TM.getSubtargetImpl()->getDataLayout()->getPointerSize();
+  unsigned IntPtrSize = AP.TM.getDataLayout()->getPointerSize();
 
   // Put this in a custom .note section.
-  AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getContext()
-    .getELFSection(".note.gc", ELF::SHT_PROGBITS, 0,
-                   SectionKind::getDataRel()));
+  AP.OutStreamer.SwitchSection(
+      AP.getObjFileLowering().getContext().getELFSection(".note.gc",
+                                                         ELF::SHT_PROGBITS, 0));
 
   // For each function...
-  for (iterator FI = begin(), FE = end(); FI != FE; ++FI) {
+  for (GCModuleInfo::FuncInfoVec::iterator FI = Info.funcinfo_begin(),
+                                           IE = Info.funcinfo_end();
+       FI != IE; ++FI) {
     GCFunctionInfo &MD = **FI;
-
+    if (MD.getStrategy().getName() != getStrategy().getName())
+      // this function is managed by some other GC
+      continue;
     /** A compact GC layout. Emit this data structure:
      *
      * struct {
@@ -88,7 +88,7 @@
       // Emit the address of the safe point.
       OS.AddComment("safe point address");
       MCSymbol *Label = PI->Label;
-      AP.EmitLabelPlusOffset(Label/*Hi*/, 0/*Offset*/, 4/*Size*/);
+      AP.EmitLabelPlusOffset(Label /*Hi*/, 0 /*Offset*/, 4 /*Size*/);
     }
 
     // Stack information never change in safe points! Only print info from the
@@ -101,8 +101,9 @@
 
     // Emit stack arity, i.e. the number of stacked arguments.
     unsigned RegisteredArgs = IntPtrSize == 4 ? 5 : 6;
-    unsigned StackArity = MD.getFunction().arg_size() > RegisteredArgs ?
-                          MD.getFunction().arg_size() - RegisteredArgs : 0;
+    unsigned StackArity = MD.getFunction().arg_size() > RegisteredArgs
+                              ? MD.getFunction().arg_size() - RegisteredArgs
+                              : 0;
     OS.AddComment("stack arity");
     AP.EmitInt16(StackArity);
 
@@ -113,7 +114,7 @@
     // And for each live root...
     for (GCFunctionInfo::live_iterator LI = MD.live_begin(PI),
                                        LE = MD.live_end(PI);
-                                       LI != LE; ++LI) {
+         LI != LE; ++LI) {
       // Emit live root's offset within the stack frame.
       OS.AddComment("stack index (offset / wordsize)");
       AP.EmitInt16(LI->StackOffset / IntPtrSize);
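
Both GC printers now receive the module-wide GCModuleInfo and must skip functions owned by a different strategy; the filter is a plain strategy-name comparison. A standalone sketch of that filtering (FuncInfo is a simplified stand-in for GCFunctionInfo):

#include <string>
#include <vector>

struct FuncInfo { std::string StrategyName; std::string FnName; };

// With several GC strategies in one module, each printer walks the shared
// funcinfo list and keeps only the functions managed by its own strategy.
std::vector<const FuncInfo *> functionsFor(const std::vector<FuncInfo> &All,
                                           const std::string &MyStrategy) {
  std::vector<const FuncInfo *> Mine;
  for (const FuncInfo &FI : All) {
    if (FI.StrategyName != MyStrategy)
      continue; // this function is managed by some other GC
    Mine.push_back(&FI);
  }
  return Mine;
}
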
diff --git a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
index 6480d048..76d6a06 100644
--- a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
@@ -32,18 +32,17 @@
 
 namespace {
 
-  class OcamlGCMetadataPrinter : public GCMetadataPrinter {
-  public:
-    void beginAssembly(AsmPrinter &AP) override;
-    void finishAssembly(AsmPrinter &AP) override;
-  };
-
+class OcamlGCMetadataPrinter : public GCMetadataPrinter {
+public:
+  void beginAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) override;
+  void finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) override;
+};
 }
 
 static GCMetadataPrinterRegistry::Add<OcamlGCMetadataPrinter>
-Y("ocaml", "ocaml 3.10-compatible collector");
+    Y("ocaml", "ocaml 3.10-compatible collector");
 
-void llvm::linkOcamlGCPrinter() { }
+void llvm::linkOcamlGCPrinter() {}
 
 static void EmitCamlGlobal(const Module &M, AsmPrinter &AP, const char *Id) {
   const std::string &MId = M.getModuleIdentifier();
@@ -67,12 +66,13 @@
   AP.OutStreamer.EmitLabel(Sym);
 }
 
-void OcamlGCMetadataPrinter::beginAssembly(AsmPrinter &AP) {
+void OcamlGCMetadataPrinter::beginAssembly(Module &M, GCModuleInfo &Info,
+                                           AsmPrinter &AP) {
   AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getTextSection());
-  EmitCamlGlobal(getModule(), AP, "code_begin");
+  EmitCamlGlobal(M, AP, "code_begin");
 
   AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getDataSection());
-  EmitCamlGlobal(getModule(), AP, "data_begin");
+  EmitCamlGlobal(M, AP, "data_begin");
 }
 
 /// emitAssembly - Print the frametable. The ocaml frametable format is thus:
@@ -91,47 +91,59 @@
 /// (FrameSize and LiveOffsets would overflow). FrameTablePrinter will abort if
 /// either condition is detected in a function which uses the GC.
 ///
-void OcamlGCMetadataPrinter::finishAssembly(AsmPrinter &AP) {
-  unsigned IntPtrSize =
-      AP.TM.getSubtargetImpl()->getDataLayout()->getPointerSize();
+void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
+                                            AsmPrinter &AP) {
+  unsigned IntPtrSize = AP.TM.getDataLayout()->getPointerSize();
 
   AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getTextSection());
-  EmitCamlGlobal(getModule(), AP, "code_end");
+  EmitCamlGlobal(M, AP, "code_end");
 
   AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getDataSection());
-  EmitCamlGlobal(getModule(), AP, "data_end");
+  EmitCamlGlobal(M, AP, "data_end");
 
   // FIXME: Why does ocaml emit this??
   AP.OutStreamer.EmitIntValue(0, IntPtrSize);
 
   AP.OutStreamer.SwitchSection(AP.getObjFileLowering().getDataSection());
-  EmitCamlGlobal(getModule(), AP, "frametable");
+  EmitCamlGlobal(M, AP, "frametable");
 
   int NumDescriptors = 0;
-  for (iterator I = begin(), IE = end(); I != IE; ++I) {
+  for (GCModuleInfo::FuncInfoVec::iterator I = Info.funcinfo_begin(),
+                                           IE = Info.funcinfo_end();
+       I != IE; ++I) {
     GCFunctionInfo &FI = **I;
+    if (FI.getStrategy().getName() != getStrategy().getName())
+      // This function is managed by some other GC.
+      continue;
     for (GCFunctionInfo::iterator J = FI.begin(), JE = FI.end(); J != JE; ++J) {
       NumDescriptors++;
     }
   }
 
-  if (NumDescriptors >= 1<<16) {
+  if (NumDescriptors >= 1 << 16) {
     // Very rude!
     report_fatal_error(" Too much descriptor for ocaml GC");
   }
   AP.EmitInt16(NumDescriptors);
   AP.EmitAlignment(IntPtrSize == 4 ? 2 : 3);
 
-  for (iterator I = begin(), IE = end(); I != IE; ++I) {
+  for (GCModuleInfo::FuncInfoVec::iterator I = Info.funcinfo_begin(),
+                                           IE = Info.funcinfo_end();
+       I != IE; ++I) {
     GCFunctionInfo &FI = **I;
+    if (FI.getStrategy().getName() != getStrategy().getName())
+      // This function is managed by some other GC.
+      continue;
 
     uint64_t FrameSize = FI.getFrameSize();
-    if (FrameSize >= 1<<16) {
+    if (FrameSize >= 1 << 16) {
       // Very rude!
       report_fatal_error("Function '" + FI.getFunction().getName() +
                          "' is too large for the ocaml GC! "
-                         "Frame size " + Twine(FrameSize) + ">= 65536.\n"
-                         "(" + Twine(uintptr_t(&FI)) + ")");
+                         "Frame size " +
+                         Twine(FrameSize) + ">= 65536.\n"
+                                            "(" +
+                         Twine(uintptr_t(&FI)) + ")");
     }
 
     AP.OutStreamer.AddComment("live roots for " +
@@ -140,11 +152,12 @@
 
     for (GCFunctionInfo::iterator J = FI.begin(), JE = FI.end(); J != JE; ++J) {
       size_t LiveCount = FI.live_size(J);
-      if (LiveCount >= 1<<16) {
+      if (LiveCount >= 1 << 16) {
         // Very rude!
         report_fatal_error("Function '" + FI.getFunction().getName() +
                            "' is too large for the ocaml GC! "
-                           "Live root count "+Twine(LiveCount)+" >= 65536.");
+                           "Live root count " +
+                           Twine(LiveCount) + " >= 65536.");
       }
 
       AP.OutStreamer.EmitSymbolValue(J->Label, IntPtrSize);
@@ -152,12 +165,13 @@
       AP.EmitInt16(LiveCount);
 
       for (GCFunctionInfo::live_iterator K = FI.live_begin(J),
-                                         KE = FI.live_end(J); K != KE; ++K) {
-        if (K->StackOffset >= 1<<16) {
+                                         KE = FI.live_end(J);
+           K != KE; ++K) {
+        if (K->StackOffset >= 1 << 16) {
           // Very rude!
           report_fatal_error(
-                 "GC root stack offset is outside of fixed stack frame and out "
-                 "of range for ocaml GC!");
+              "GC root stack offset is outside of fixed stack frame and out "
+              "of range for ocaml GC!");
         }
         AP.EmitInt16(K->StackOffset);
       }
diff --git a/lib/CodeGen/AsmPrinter/Win64Exception.cpp b/lib/CodeGen/AsmPrinter/Win64Exception.cpp
index 0f0ad75..2b03877 100644
--- a/lib/CodeGen/AsmPrinter/Win64Exception.cpp
+++ b/lib/CodeGen/AsmPrinter/Win64Exception.cpp
@@ -60,7 +60,7 @@
 
   const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
   unsigned PerEncoding = TLOF.getPersonalityEncoding();
-  const Function *Per = MMI->getPersonalities()[MMI->getPersonalityIndex()];
+  const Function *Per = MF->getMMI().getPersonality();
 
   shouldEmitPersonality = hasLandingPads &&
     PerEncoding != dwarf::DW_EH_PE_omit && Per;
@@ -99,9 +99,151 @@
 
   if (shouldEmitPersonality) {
     Asm->OutStreamer.PushSection();
+
+    // Emit an UNWIND_INFO struct describing the prologue.
     Asm->OutStreamer.EmitWinEHHandlerData();
-    emitExceptionTable();
+
+    // Emit the tables appropriate to the personality function in use. If we
+    // don't recognize the personality, assume it uses an Itanium-style LSDA.
+    EHPersonality Per = MMI->getPersonalityType();
+    if (Per == EHPersonality::MSVC_Win64SEH)
+      emitCSpecificHandlerTable();
+    else
+      emitExceptionTable();
+
     Asm->OutStreamer.PopSection();
   }
   Asm->OutStreamer.EmitWinCFIEndProc();
 }
+
+const MCSymbolRefExpr *Win64Exception::createImageRel32(const MCSymbol *Value) {
+  return MCSymbolRefExpr::Create(Value, MCSymbolRefExpr::VK_COFF_IMGREL32,
+                                 Asm->OutContext);
+}
+
+/// Emit the language-specific data that __C_specific_handler expects.  This
+/// handler lives in the x64 Microsoft C runtime and allows catching or cleaning
+/// up after faults with __try, __except, and __finally.  The typeinfo values
+/// are not really RTTI data, but pointers to filter functions that return an
+/// integer (1, 0, or -1) indicating how to handle the exception. For __finally
+/// blocks and other cleanups, the landing pad label is zero, and the filter
+/// function is actually a cleanup handler with the same prototype.  A catch-all
+/// entry is modeled with a null filter function field and a non-zero landing
+/// pad label.
+///
+/// Possible filter function return values:
+///   EXCEPTION_EXECUTE_HANDLER (1):
+///     Jump to the landing pad label after cleanups.
+///   EXCEPTION_CONTINUE_SEARCH (0):
+///     Continue searching this table or continue unwinding.
+///   EXCEPTION_CONTINUE_EXECUTION (-1):
+///     Resume execution at the trapping PC.
+///
+/// Inferred table structure:
+///   struct Table {
+///     int NumEntries;
+///     struct Entry {
+///       imagerel32 LabelStart;
+///       imagerel32 LabelEnd;
+///       imagerel32 FilterOrFinally;  // One means catch-all.
+///       imagerel32 LabelLPad;        // Zero means __finally.
+///     } Entries[NumEntries];
+///   };
+void Win64Exception::emitCSpecificHandlerTable() {
+  const std::vector<LandingPadInfo> &PadInfos = MMI->getLandingPads();
+
+  // Simplifying assumptions for first implementation:
+  // - Cleanups are not implemented.
+  // - Filters are not implemented.
+
+  // The Itanium LSDA table sorts similar landing pads together to simplify the
+  // actions table, but we don't need that.
+  SmallVector<const LandingPadInfo *, 64> LandingPads;
+  LandingPads.reserve(PadInfos.size());
+  for (const auto &LP : PadInfos)
+    LandingPads.push_back(&LP);
+
+  // Compute label ranges for call sites as we would for the Itanium LSDA, but
+  // use an all zero action table because we aren't using these actions.
+  SmallVector<unsigned, 64> FirstActions;
+  FirstActions.resize(LandingPads.size());
+  SmallVector<CallSiteEntry, 64> CallSites;
+  computeCallSiteTable(CallSites, LandingPads, FirstActions);
+
+  MCSymbol *EHFuncBeginSym =
+      Asm->GetTempSymbol("eh_func_begin", Asm->getFunctionNumber());
+  MCSymbol *EHFuncEndSym =
+      Asm->GetTempSymbol("eh_func_end", Asm->getFunctionNumber());
+
+  // Emit the number of table entries.
+  unsigned NumEntries = 0;
+  for (const CallSiteEntry &CSE : CallSites) {
+    if (!CSE.LPad)
+      continue; // Ignore gaps.
+    for (int Selector : CSE.LPad->TypeIds) {
+      // Ignore C++ filter clauses in SEH.
+      // FIXME: Implement cleanup clauses.
+      if (isCatchEHSelector(Selector))
+        ++NumEntries;
+    }
+  }
+  Asm->OutStreamer.EmitIntValue(NumEntries, 4);
+
+  // Emit the four-label records for each call site entry. The table has to be
+  // sorted in layout order, and the call sites should already be sorted.
+  for (const CallSiteEntry &CSE : CallSites) {
+    // Ignore gaps. Unlike the Itanium model, unwinding through a frame without
+    // an EH table entry will propagate the exception rather than terminating
+    // the program.
+    if (!CSE.LPad)
+      continue;
+    const LandingPadInfo *LPad = CSE.LPad;
+
+    // Compute the label range. We may reuse the function begin and end labels
+    // rather than forming new ones.
+    const MCExpr *Begin =
+        createImageRel32(CSE.BeginLabel ? CSE.BeginLabel : EHFuncBeginSym);
+    const MCExpr *End;
+    if (CSE.EndLabel) {
+      // The interval is half-open, so we have to add one to include the return
+      // address of the last invoke in the range.
+      End = MCBinaryExpr::CreateAdd(createImageRel32(CSE.EndLabel),
+                                    MCConstantExpr::Create(1, Asm->OutContext),
+                                    Asm->OutContext);
+    } else {
+      End = createImageRel32(EHFuncEndSym);
+    }
+
+    // These aren't really type info globals, they are actually pointers to
+    // filter functions ordered by selector. The zero selector is used for
+    // cleanups, so slot zero corresponds to selector 1.
+    const std::vector<const GlobalValue *> &SelectorToFilter = MMI->getTypeInfos();
+
+    // Do a parallel iteration across typeids and clause labels, skipping filter
+    // clauses.
+    size_t NextClauseLabel = 0;
+    for (size_t I = 0, E = LPad->TypeIds.size(); I < E; ++I) {
+      // AddLandingPadInfo stores the clauses in reverse, but there is a FIXME
+      // to change that.
+      int Selector = LPad->TypeIds[E - I - 1];
+
+      // Ignore C++ filter clauses in SEH.
+      // FIXME: Implement cleanup clauses.
+      if (!isCatchEHSelector(Selector))
+        continue;
+
+      Asm->OutStreamer.EmitValue(Begin, 4);
+      Asm->OutStreamer.EmitValue(End, 4);
+      if (isCatchEHSelector(Selector)) {
+        assert(unsigned(Selector - 1) < SelectorToFilter.size());
+        const GlobalValue *TI = SelectorToFilter[Selector - 1];
+        if (TI) // Emit the filter function pointer.
+          Asm->OutStreamer.EmitValue(createImageRel32(Asm->getSymbol(TI)), 4);
+        else  // Otherwise, this is a "catch i8* null", or catch all.
+          Asm->OutStreamer.EmitIntValue(1, 4);
+      }
+      MCSymbol *ClauseLabel = LPad->ClauseLabels[NextClauseLabel++];
+      Asm->OutStreamer.EmitValue(createImageRel32(ClauseLabel), 4);
+    }
+  }
+}
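
For background on the filter return values documented in the comment above, the following is a rough sketch of an MSVC-style __try/__except use that produces such a filter; the function names and the access-violation check are illustrative assumptions, and the code needs a compiler with SEH support (cl or clang-cl), not part of this patch.

#include <windows.h>

// 1  = EXCEPTION_EXECUTE_HANDLER:    unwind and run the __except block.
// 0  = EXCEPTION_CONTINUE_SEARCH:    keep searching enclosing handlers.
// -1 = EXCEPTION_CONTINUE_EXECUTION: resume at the faulting instruction.
static int Filter(unsigned long Code) {
  return Code == EXCEPTION_ACCESS_VIOLATION ? EXCEPTION_EXECUTE_HANDLER
                                            : EXCEPTION_CONTINUE_SEARCH;
}

int ReadGuarded(const int *P) {
  __try {
    return *P;                               // may fault
  } __except (Filter(GetExceptionCode())) {
    return -1;                               // reached only if the filter returns 1
  }
}
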
diff --git a/lib/CodeGen/AsmPrinter/Win64Exception.h b/lib/CodeGen/AsmPrinter/Win64Exception.h
index 538e132..b2d5d1b 100644
--- a/lib/CodeGen/AsmPrinter/Win64Exception.h
+++ b/lib/CodeGen/AsmPrinter/Win64Exception.h
@@ -29,6 +29,10 @@
   /// Per-function flag to indicate if frame moves info should be emitted.
   bool shouldEmitMoves;
 
+  void emitCSpecificHandlerTable();
+
+  const MCSymbolRefExpr *createImageRel32(const MCSymbol *Value);
+
 public:
   //===--------------------------------------------------------------------===//
   // Main entry points.
diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp
index 12f6bd7..4b64be0 100644
--- a/lib/CodeGen/AtomicExpandPass.cpp
+++ b/lib/CodeGen/AtomicExpandPass.cpp
@@ -31,10 +31,11 @@
 namespace {
   class AtomicExpand: public FunctionPass {
     const TargetMachine *TM;
+    const TargetLowering *TLI;
   public:
     static char ID; // Pass identification, replacement for typeid
     explicit AtomicExpand(const TargetMachine *TM = nullptr)
-      : FunctionPass(ID), TM(TM) {
+      : FunctionPass(ID), TM(TM), TLI(nullptr) {
       initializeAtomicExpandPass(*PassRegistry::getPassRegistry());
     }
 
@@ -67,9 +68,9 @@
 }
 
 bool AtomicExpand::runOnFunction(Function &F) {
-  if (!TM || !TM->getSubtargetImpl()->enableAtomicExpand())
+  if (!TM || !TM->getSubtargetImpl(F)->enableAtomicExpand())
     return false;
-  auto TargetLowering = TM->getSubtargetImpl()->getTargetLowering();
+  TLI = TM->getSubtargetImpl(F)->getTargetLowering();
 
   SmallVector<Instruction *, 1> AtomicInsts;
 
@@ -91,7 +92,7 @@
 
     auto FenceOrdering = Monotonic;
     bool IsStore, IsLoad;
-    if (TargetLowering->getInsertFencesForAtomic()) {
+    if (TLI->getInsertFencesForAtomic()) {
       if (LI && isAtLeastAcquire(LI->getOrdering())) {
         FenceOrdering = LI->getOrdering();
         LI->setOrdering(Monotonic);
@@ -107,9 +108,9 @@
         FenceOrdering = RMWI->getOrdering();
         RMWI->setOrdering(Monotonic);
         IsStore = IsLoad = true;
-      } else if (CASI && !TargetLowering->hasLoadLinkedStoreConditional() &&
-                    (isAtLeastRelease(CASI->getSuccessOrdering()) ||
-                     isAtLeastAcquire(CASI->getSuccessOrdering()))) {
+      } else if (CASI && !TLI->hasLoadLinkedStoreConditional() &&
+                 (isAtLeastRelease(CASI->getSuccessOrdering()) ||
+                  isAtLeastAcquire(CASI->getSuccessOrdering()))) {
         // If a compare and swap is lowered to LL/SC, we can do smarter fence
         // insertion, with a stronger one on the success path than on the
         // failure path. As a result, fence insertion is directly done by
@@ -125,20 +126,19 @@
       }
     }
 
-    if (LI && TargetLowering->shouldExpandAtomicLoadInIR(LI)) {
+    if (LI && TLI->shouldExpandAtomicLoadInIR(LI)) {
       MadeChange |= expandAtomicLoad(LI);
-    } else if (SI && TargetLowering->shouldExpandAtomicStoreInIR(SI)) {
+    } else if (SI && TLI->shouldExpandAtomicStoreInIR(SI)) {
       MadeChange |= expandAtomicStore(SI);
     } else if (RMWI) {
       // There are two different ways of expanding RMW instructions:
       // - into a load if it is idempotent
       // - into a Cmpxchg/LL-SC loop otherwise
       // we try them in that order.
-      MadeChange |= (isIdempotentRMW(RMWI) &&
-                        simplifyIdempotentRMW(RMWI)) ||
-                    (TargetLowering->shouldExpandAtomicRMWInIR(RMWI) &&
-                        expandAtomicRMW(RMWI));
-    } else if (CASI && TargetLowering->hasLoadLinkedStoreConditional()) {
+      MadeChange |=
+          (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) ||
+          (TLI->shouldExpandAtomicRMWInIR(RMWI) && expandAtomicRMW(RMWI));
+    } else if (CASI && TLI->hasLoadLinkedStoreConditional()) {
       MadeChange |= expandAtomicCmpXchg(CASI);
     }
   }
@@ -149,13 +149,9 @@
                                          bool IsStore, bool IsLoad) {
   IRBuilder<> Builder(I);
 
-  auto LeadingFence =
-      TM->getSubtargetImpl()->getTargetLowering()->emitLeadingFence(
-      Builder, Order, IsStore, IsLoad);
+  auto LeadingFence = TLI->emitLeadingFence(Builder, Order, IsStore, IsLoad);
 
-  auto TrailingFence =
-      TM->getSubtargetImpl()->getTargetLowering()->emitTrailingFence(
-      Builder, Order, IsStore, IsLoad);
+  auto TrailingFence = TLI->emitTrailingFence(Builder, Order, IsStore, IsLoad);
   // The trailing fence is emitted before the instruction instead of after
   // because there is no easy way of setting Builder insertion point after
   // an instruction. So we must erase it from the BB, and insert it back
@@ -171,16 +167,13 @@
 }
 
 bool AtomicExpand::expandAtomicLoad(LoadInst *LI) {
-   if (TM->getSubtargetImpl()
-          ->getTargetLowering()
-          ->hasLoadLinkedStoreConditional())
+  if (TLI->hasLoadLinkedStoreConditional())
     return expandAtomicLoadToLL(LI);
   else
     return expandAtomicLoadToCmpXchg(LI);
 }
 
 bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
-  auto TLI = TM->getSubtargetImpl()->getTargetLowering();
   IRBuilder<> Builder(LI);
 
   // On some architectures, load-linked instructions are atomic for larger
@@ -231,9 +224,7 @@
 }
 
 bool AtomicExpand::expandAtomicRMW(AtomicRMWInst *AI) {
-  if (TM->getSubtargetImpl()
-          ->getTargetLowering()
-          ->hasLoadLinkedStoreConditional())
+  if (TLI->hasLoadLinkedStoreConditional())
     return expandAtomicRMWToLLSC(AI);
   else
     return expandAtomicRMWToCmpXchg(AI);
@@ -277,7 +268,6 @@
 }
 
 bool AtomicExpand::expandAtomicRMWToLLSC(AtomicRMWInst *AI) {
-  auto TLI = TM->getSubtargetImpl()->getTargetLowering();
   AtomicOrdering MemOpOrder = AI->getOrdering();
   Value *Addr = AI->getPointerOperand();
   BasicBlock *BB = AI->getParent();
@@ -397,7 +387,6 @@
 }
 
 bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
-  auto TLI = TM->getSubtargetImpl()->getTargetLowering();
   AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
   AtomicOrdering FailureOrder = CI->getFailureOrdering();
   Value *Addr = CI->getPointerOperand();
@@ -551,13 +540,10 @@
 }
 
 bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst* RMWI) {
-  auto TLI = TM->getSubtargetImpl()->getTargetLowering();
-
   if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
     if (TLI->shouldExpandAtomicLoadInIR(ResultingLoad))
       expandAtomicLoad(ResultingLoad);
     return true;
   }
-
   return false;
 }
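
As a rough illustration of the idempotent-RMW case discussed in this hunk: an atomic read-modify-write whose operand cannot change the stored value can, on targets that opt in, be rewritten as a fence plus an ordinary atomic load, which is the intent of the simplifyIdempotentRMW path above. The snippet below is a sketch in plain C++ with std::atomic, not the pass's IR-level rewrite.

#include <atomic>

// Idempotent RMW: adding zero never changes the value, so a suitable target
// may lower this to an appropriate fence followed by a plain atomic load.
int observe(std::atomic<int> &Counter) {
  return Counter.fetch_add(0, std::memory_order_acquire);
}

// Roughly the shape such a target would aim for after the rewrite.
int observeAsLoad(std::atomic<int> &Counter) {
  return Counter.load(std::memory_order_acquire);
}
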
diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp
index b9b1fd8..82f5c48 100644
--- a/lib/CodeGen/BasicTargetTransformInfo.cpp
+++ b/lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -15,633 +15,23 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/TargetTransformInfoImpl.h"
+#include "llvm/CodeGen/Passes.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 #include <utility>
 using namespace llvm;
 
-static cl::opt<unsigned>
-PartialUnrollingThreshold("partial-unrolling-threshold", cl::init(0),
-  cl::desc("Threshold for partial unrolling"), cl::Hidden);
-
 #define DEBUG_TYPE "basictti"
 
-namespace {
+// This flag is used by the template base class for BasicTTIImpl, and here to
+// provide a definition.
+cl::opt<unsigned>
+    llvm::PartialUnrollingThreshold("partial-unrolling-threshold", cl::init(0),
+                                    cl::desc("Threshold for partial unrolling"),
+                                    cl::Hidden);
 
-class BasicTTI final : public ImmutablePass, public TargetTransformInfo {
-  const TargetMachine *TM;
-
-  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
-  /// are set if the result needs to be inserted and/or extracted from vectors.
-  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
-
-  /// Estimate the cost overhead of SK_Alternate shuffle.
-  unsigned getAltShuffleOverhead(Type *Ty) const;
-
-  const TargetLoweringBase *getTLI() const {
-    return TM->getSubtargetImpl()->getTargetLowering();
-  }
-
-public:
-  BasicTTI() : ImmutablePass(ID), TM(nullptr) {
-    llvm_unreachable("This pass cannot be directly constructed");
-  }
-
-  BasicTTI(const TargetMachine *TM) : ImmutablePass(ID), TM(TM) {
-    initializeBasicTTIPass(*PassRegistry::getPassRegistry());
-  }
-
-  void initializePass() override {
-    pushTTIStack(this);
-  }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    TargetTransformInfo::getAnalysisUsage(AU);
-  }
-
-  /// Pass identification.
-  static char ID;
-
-  /// Provide necessary pointer adjustments for the two base classes.
-  void *getAdjustedAnalysisPointer(const void *ID) override {
-    if (ID == &TargetTransformInfo::ID)
-      return (TargetTransformInfo*)this;
-    return this;
-  }
-
-  bool hasBranchDivergence() const override;
-
-  /// \name Scalar TTI Implementations
-  /// @{
-
-  bool isLegalAddImmediate(int64_t imm) const override;
-  bool isLegalICmpImmediate(int64_t imm) const override;
-  bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
-                             int64_t BaseOffset, bool HasBaseReg,
-                             int64_t Scale) const override;
-  int getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
-                           int64_t BaseOffset, bool HasBaseReg,
-                           int64_t Scale) const override;
-  bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
-  bool isTypeLegal(Type *Ty) const override;
-  unsigned getJumpBufAlignment() const override;
-  unsigned getJumpBufSize() const override;
-  bool shouldBuildLookupTables() const override;
-  bool haveFastSqrt(Type *Ty) const override;
-  void getUnrollingPreferences(const Function *F, Loop *L,
-                               UnrollingPreferences &UP) const override;
-
-  /// @}
-
-  /// \name Vector TTI Implementations
-  /// @{
-
-  unsigned getNumberOfRegisters(bool Vector) const override;
-  unsigned getMaxInterleaveFactor() const override;
-  unsigned getRegisterBitWidth(bool Vector) const override;
-  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, OperandValueKind,
-                                  OperandValueKind, OperandValueProperties,
-                                  OperandValueProperties) const override;
-  unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
-                          int Index, Type *SubTp) const override;
-  unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
-                            Type *Src) const override;
-  unsigned getCFInstrCost(unsigned Opcode) const override;
-  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                              Type *CondTy) const override;
-  unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
-                              unsigned Index) const override;
-  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                           unsigned AddressSpace) const override;
-  unsigned getIntrinsicInstrCost(Intrinsic::ID, Type *RetTy,
-                                 ArrayRef<Type*> Tys) const override;
-  unsigned getNumberOfParts(Type *Tp) const override;
-  unsigned getAddressComputationCost( Type *Ty, bool IsComplex) const override;
-  unsigned getReductionCost(unsigned Opcode, Type *Ty,
-                            bool IsPairwise) const override;
-
-  /// @}
-};
-
-}
-
-INITIALIZE_AG_PASS(BasicTTI, TargetTransformInfo, "basictti",
-                   "Target independent code generator's TTI", true, true, false)
-char BasicTTI::ID = 0;
-
-ImmutablePass *
-llvm::createBasicTargetTransformInfoPass(const TargetMachine *TM) {
-  return new BasicTTI(TM);
-}
-
-bool BasicTTI::hasBranchDivergence() const { return false; }
-
-bool BasicTTI::isLegalAddImmediate(int64_t imm) const {
-  return getTLI()->isLegalAddImmediate(imm);
-}
-
-bool BasicTTI::isLegalICmpImmediate(int64_t imm) const {
-  return getTLI()->isLegalICmpImmediate(imm);
-}
-
-bool BasicTTI::isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
-                                     int64_t BaseOffset, bool HasBaseReg,
-                                     int64_t Scale) const {
-  TargetLoweringBase::AddrMode AM;
-  AM.BaseGV = BaseGV;
-  AM.BaseOffs = BaseOffset;
-  AM.HasBaseReg = HasBaseReg;
-  AM.Scale = Scale;
-  return getTLI()->isLegalAddressingMode(AM, Ty);
-}
-
-int BasicTTI::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
-                                   int64_t BaseOffset, bool HasBaseReg,
-                                   int64_t Scale) const {
-  TargetLoweringBase::AddrMode AM;
-  AM.BaseGV = BaseGV;
-  AM.BaseOffs = BaseOffset;
-  AM.HasBaseReg = HasBaseReg;
-  AM.Scale = Scale;
-  return getTLI()->getScalingFactorCost(AM, Ty);
-}
-
-bool BasicTTI::isTruncateFree(Type *Ty1, Type *Ty2) const {
-  return getTLI()->isTruncateFree(Ty1, Ty2);
-}
-
-bool BasicTTI::isTypeLegal(Type *Ty) const {
-  EVT T = getTLI()->getValueType(Ty);
-  return getTLI()->isTypeLegal(T);
-}
-
-unsigned BasicTTI::getJumpBufAlignment() const {
-  return getTLI()->getJumpBufAlignment();
-}
-
-unsigned BasicTTI::getJumpBufSize() const {
-  return getTLI()->getJumpBufSize();
-}
-
-bool BasicTTI::shouldBuildLookupTables() const {
-  const TargetLoweringBase *TLI = getTLI();
-  return TLI->isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
-         TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
-}
-
-bool BasicTTI::haveFastSqrt(Type *Ty) const {
-  const TargetLoweringBase *TLI = getTLI();
-  EVT VT = TLI->getValueType(Ty);
-  return TLI->isTypeLegal(VT) && TLI->isOperationLegalOrCustom(ISD::FSQRT, VT);
-}
-
-void BasicTTI::getUnrollingPreferences(const Function *F, Loop *L,
-                                       UnrollingPreferences &UP) const {
-  // This unrolling functionality is target independent, but to provide some
-  // motivation for its intended use, for x86:
-
-  // According to the Intel 64 and IA-32 Architectures Optimization Reference
-  // Manual, Intel Core models and later have a loop stream detector
-  // (and associated uop queue) that can benefit from partial unrolling.
-  // The relevant requirements are:
-  //  - The loop must have no more than 4 (8 for Nehalem and later) branches
-  //    taken, and none of them may be calls.
-  //  - The loop can have no more than 18 (28 for Nehalem and later) uops.
-
-  // According to the Software Optimization Guide for AMD Family 15h Processors,
-  // models 30h-4fh (Steamroller and later) have a loop predictor and loop
-  // buffer which can benefit from partial unrolling.
-  // The relevant requirements are:
-  //  - The loop must have fewer than 16 branches
-  //  - The loop must have less than 40 uops in all executed loop branches
-
-  // The number of taken branches in a loop is hard to estimate here, and
-  // benchmarking has revealed that it is better not to be conservative when
-  // estimating the branch count. As a result, we'll ignore the branch limits
-  // until someone finds a case where it matters in practice.
-
-  unsigned MaxOps;
-  const TargetSubtargetInfo *ST = &TM->getSubtarget<TargetSubtargetInfo>(F);
-  if (PartialUnrollingThreshold.getNumOccurrences() > 0)
-    MaxOps = PartialUnrollingThreshold;
-  else if (ST->getSchedModel().LoopMicroOpBufferSize > 0)
-    MaxOps = ST->getSchedModel().LoopMicroOpBufferSize;
-  else
-    return;
-
-  // Scan the loop: don't unroll loops with calls.
-  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
-       I != E; ++I) {
-    BasicBlock *BB = *I;
-
-    for (BasicBlock::iterator J = BB->begin(), JE = BB->end(); J != JE; ++J)
-      if (isa<CallInst>(J) || isa<InvokeInst>(J)) {
-        ImmutableCallSite CS(J);
-        if (const Function *F = CS.getCalledFunction()) {
-          if (!TopTTI->isLoweredToCall(F))
-            continue;
-        }
-
-        return;
-      }
-  }
-
-  // Enable runtime and partial unrolling up to the specified size.
-  UP.Partial = UP.Runtime = true;
-  UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;
-}
-
-//===----------------------------------------------------------------------===//
-//
-// Calls used by the vectorizers.
-//
-//===----------------------------------------------------------------------===//
-
-unsigned BasicTTI::getScalarizationOverhead(Type *Ty, bool Insert,
-                                            bool Extract) const {
-  assert (Ty->isVectorTy() && "Can only scalarize vectors");
-  unsigned Cost = 0;
-
-  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
-    if (Insert)
-      Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
-    if (Extract)
-      Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
-  }
-
-  return Cost;
-}
-
-unsigned BasicTTI::getNumberOfRegisters(bool Vector) const {
-  return 1;
-}
-
-unsigned BasicTTI::getRegisterBitWidth(bool Vector) const {
-  return 32;
-}
-
-unsigned BasicTTI::getMaxInterleaveFactor() const {
-  return 1;
-}
-
-unsigned BasicTTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
-                                          OperandValueKind, OperandValueKind,
-                                          OperandValueProperties,
-                                          OperandValueProperties) const {
-  // Check if any of the operands are vector operands.
-  const TargetLoweringBase *TLI = getTLI();
-  int ISD = TLI->InstructionOpcodeToISD(Opcode);
-  assert(ISD && "Invalid opcode");
-
-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
-
-  bool IsFloat = Ty->getScalarType()->isFloatingPointTy();
-  // Assume that floating point arithmetic operations cost twice as much as
-  // integer operations.
-  unsigned OpCost = (IsFloat ? 2 : 1);
-
-  if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
-    // The operation is legal. Assume it costs 1.
-    // If the type is split to multiple registers, assume that there is some
-    // overhead to this.
-    // TODO: Once we have extract/insert subvector cost we need to use them.
-    if (LT.first > 1)
-      return LT.first * 2 * OpCost;
-    return LT.first * 1 * OpCost;
-  }
-
-  if (!TLI->isOperationExpand(ISD, LT.second)) {
-    // If the operation is custom lowered then assume
-    // thare the code is twice as expensive.
-    return LT.first * 2 * OpCost;
-  }
-
-  // Else, assume that we need to scalarize this op.
-  if (Ty->isVectorTy()) {
-    unsigned Num = Ty->getVectorNumElements();
-    unsigned Cost = TopTTI->getArithmeticInstrCost(Opcode, Ty->getScalarType());
-    // return the cost of multiple scalar invocation plus the cost of inserting
-    // and extracting the values.
-    return getScalarizationOverhead(Ty, true, true) + Num * Cost;
-  }
-
-  // We don't know anything about this scalar instruction.
-  return OpCost;
-}
-
-unsigned BasicTTI::getAltShuffleOverhead(Type *Ty) const {
-  assert(Ty->isVectorTy() && "Can only shuffle vectors");
-  unsigned Cost = 0;
-  // Shuffle cost is equal to the cost of extracting element from its argument
-  // plus the cost of inserting them onto the result vector.
-
-  // e.g. <4 x float> has a mask of <0,5,2,7> i.e we need to extract from index
-  // 0 of first vector, index 1 of second vector,index 2 of first vector and
-  // finally index 3 of second vector and insert them at index <0,1,2,3> of
-  // result vector.
-  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
-    Cost += TopTTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
-    Cost += TopTTI->getVectorInstrCost(Instruction::ExtractElement, Ty, i);
-  }
-  return Cost;
-}
-
-unsigned BasicTTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
-                                  Type *SubTp) const {
-  if (Kind == SK_Alternate) {
-    return getAltShuffleOverhead(Tp);
-  }
-  return 1;
-}
-
-unsigned BasicTTI::getCastInstrCost(unsigned Opcode, Type *Dst,
-                                    Type *Src) const {
-  const TargetLoweringBase *TLI = getTLI();
-  int ISD = TLI->InstructionOpcodeToISD(Opcode);
-  assert(ISD && "Invalid opcode");
-
-  std::pair<unsigned, MVT> SrcLT = TLI->getTypeLegalizationCost(Src);
-  std::pair<unsigned, MVT> DstLT = TLI->getTypeLegalizationCost(Dst);
-
-  // Check for NOOP conversions.
-  if (SrcLT.first == DstLT.first &&
-      SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {
-
-      // Bitcast between types that are legalized to the same type are free.
-      if (Opcode == Instruction::BitCast || Opcode == Instruction::Trunc)
-        return 0;
-  }
-
-  if (Opcode == Instruction::Trunc &&
-      TLI->isTruncateFree(SrcLT.second, DstLT.second))
-    return 0;
-
-  if (Opcode == Instruction::ZExt &&
-      TLI->isZExtFree(SrcLT.second, DstLT.second))
-    return 0;
-
-  // If the cast is marked as legal (or promote) then assume low cost.
-  if (SrcLT.first == DstLT.first &&
-      TLI->isOperationLegalOrPromote(ISD, DstLT.second))
-    return 1;
-
-  // Handle scalar conversions.
-  if (!Src->isVectorTy() && !Dst->isVectorTy()) {
-
-    // Scalar bitcasts are usually free.
-    if (Opcode == Instruction::BitCast)
-      return 0;
-
-    // Just check the op cost. If the operation is legal then assume it costs 1.
-    if (!TLI->isOperationExpand(ISD, DstLT.second))
-      return  1;
-
-    // Assume that illegal scalar instruction are expensive.
-    return 4;
-  }
-
-  // Check vector-to-vector casts.
-  if (Dst->isVectorTy() && Src->isVectorTy()) {
-
-    // If the cast is between same-sized registers, then the check is simple.
-    if (SrcLT.first == DstLT.first &&
-        SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {
-
-      // Assume that Zext is done using AND.
-      if (Opcode == Instruction::ZExt)
-        return 1;
-
-      // Assume that sext is done using SHL and SRA.
-      if (Opcode == Instruction::SExt)
-        return 2;
-
-      // Just check the op cost. If the operation is legal then assume it costs
-      // 1 and multiply by the type-legalization overhead.
-      if (!TLI->isOperationExpand(ISD, DstLT.second))
-        return SrcLT.first * 1;
-    }
-
-    // If we are converting vectors and the operation is illegal, or
-    // if the vectors are legalized to different types, estimate the
-    // scalarization costs.
-    unsigned Num = Dst->getVectorNumElements();
-    unsigned Cost = TopTTI->getCastInstrCost(Opcode, Dst->getScalarType(),
-                                             Src->getScalarType());
-
-    // Return the cost of multiple scalar invocation plus the cost of
-    // inserting and extracting the values.
-    return getScalarizationOverhead(Dst, true, true) + Num * Cost;
-  }
-
-  // We already handled vector-to-vector and scalar-to-scalar conversions. This
-  // is where we handle bitcast between vectors and scalars. We need to assume
-  //  that the conversion is scalarized in one way or another.
-  if (Opcode == Instruction::BitCast)
-    // Illegal bitcasts are done by storing and loading from a stack slot.
-    return (Src->isVectorTy()? getScalarizationOverhead(Src, false, true):0) +
-           (Dst->isVectorTy()? getScalarizationOverhead(Dst, true, false):0);
-
-  llvm_unreachable("Unhandled cast");
- }
-
-unsigned BasicTTI::getCFInstrCost(unsigned Opcode) const {
-  // Branches are assumed to be predicted.
-  return 0;
-}
-
-unsigned BasicTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                                      Type *CondTy) const {
-  const TargetLoweringBase *TLI = getTLI();
-  int ISD = TLI->InstructionOpcodeToISD(Opcode);
-  assert(ISD && "Invalid opcode");
-
-  // Selects on vectors are actually vector selects.
-  if (ISD == ISD::SELECT) {
-    assert(CondTy && "CondTy must exist");
-    if (CondTy->isVectorTy())
-      ISD = ISD::VSELECT;
-  }
-
-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
-
-  if (!(ValTy->isVectorTy() && !LT.second.isVector()) &&
-      !TLI->isOperationExpand(ISD, LT.second)) {
-    // The operation is legal. Assume it costs 1. Multiply
-    // by the type-legalization overhead.
-    return LT.first * 1;
-  }
-
-  // Otherwise, assume that the cast is scalarized.
-  if (ValTy->isVectorTy()) {
-    unsigned Num = ValTy->getVectorNumElements();
-    if (CondTy)
-      CondTy = CondTy->getScalarType();
-    unsigned Cost = TopTTI->getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
-                                               CondTy);
-
-    // Return the cost of multiple scalar invocation plus the cost of inserting
-    // and extracting the values.
-    return getScalarizationOverhead(ValTy, true, false) + Num * Cost;
-  }
-
-  // Unknown scalar opcode.
-  return 1;
-}
-
-unsigned BasicTTI::getVectorInstrCost(unsigned Opcode, Type *Val,
-                                      unsigned Index) const {
-  std::pair<unsigned, MVT> LT =  getTLI()->getTypeLegalizationCost(Val->getScalarType());
-
-  return LT.first;
-}
-
-unsigned BasicTTI::getMemoryOpCost(unsigned Opcode, Type *Src,
-                                   unsigned Alignment,
-                                   unsigned AddressSpace) const {
-  assert(!Src->isVoidTy() && "Invalid type");
-  std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(Src);
-
-  // Assuming that all loads of legal types cost 1.
-  unsigned Cost = LT.first;
-
-  if (Src->isVectorTy() &&
-      Src->getPrimitiveSizeInBits() < LT.second.getSizeInBits()) {
-    // This is a vector load that legalizes to a larger type than the vector
-    // itself. Unless the corresponding extending load or truncating store is
-    // legal, then this will scalarize.
-    TargetLowering::LegalizeAction LA = TargetLowering::Expand;
-    EVT MemVT = getTLI()->getValueType(Src, true);
-    if (MemVT.isSimple() && MemVT != MVT::Other) {
-      if (Opcode == Instruction::Store)
-        LA = getTLI()->getTruncStoreAction(LT.second, MemVT.getSimpleVT());
-      else
-        LA = getTLI()->getLoadExtAction(ISD::EXTLOAD, MemVT.getSimpleVT());
-    }
-
-    if (LA != TargetLowering::Legal && LA != TargetLowering::Custom) {
-      // This is a vector load/store for some illegal type that is scalarized.
-      // We must account for the cost of building or decomposing the vector.
-      Cost += getScalarizationOverhead(Src, Opcode != Instruction::Store,
-                                            Opcode == Instruction::Store);
-    }
-  }
-
-  return Cost;
-}
-
-unsigned BasicTTI::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                                         ArrayRef<Type *> Tys) const {
-  unsigned ISD = 0;
-  switch (IID) {
-  default: {
-    // Assume that we need to scalarize this intrinsic.
-    unsigned ScalarizationCost = 0;
-    unsigned ScalarCalls = 1;
-    if (RetTy->isVectorTy()) {
-      ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
-      ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
-    }
-    for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
-      if (Tys[i]->isVectorTy()) {
-        ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
-        ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
-      }
-    }
-
-    return ScalarCalls + ScalarizationCost;
-  }
-  // Look for intrinsics that can be lowered directly or turned into a scalar
-  // intrinsic call.
-  case Intrinsic::sqrt:    ISD = ISD::FSQRT;  break;
-  case Intrinsic::sin:     ISD = ISD::FSIN;   break;
-  case Intrinsic::cos:     ISD = ISD::FCOS;   break;
-  case Intrinsic::exp:     ISD = ISD::FEXP;   break;
-  case Intrinsic::exp2:    ISD = ISD::FEXP2;  break;
-  case Intrinsic::log:     ISD = ISD::FLOG;   break;
-  case Intrinsic::log10:   ISD = ISD::FLOG10; break;
-  case Intrinsic::log2:    ISD = ISD::FLOG2;  break;
-  case Intrinsic::fabs:    ISD = ISD::FABS;   break;
-  case Intrinsic::minnum:  ISD = ISD::FMINNUM; break;
-  case Intrinsic::maxnum:  ISD = ISD::FMAXNUM; break;
-  case Intrinsic::copysign: ISD = ISD::FCOPYSIGN; break;
-  case Intrinsic::floor:   ISD = ISD::FFLOOR; break;
-  case Intrinsic::ceil:    ISD = ISD::FCEIL;  break;
-  case Intrinsic::trunc:   ISD = ISD::FTRUNC; break;
-  case Intrinsic::nearbyint:
-                           ISD = ISD::FNEARBYINT; break;
-  case Intrinsic::rint:    ISD = ISD::FRINT;  break;
-  case Intrinsic::round:   ISD = ISD::FROUND; break;
-  case Intrinsic::pow:     ISD = ISD::FPOW;   break;
-  case Intrinsic::fma:     ISD = ISD::FMA;    break;
-  case Intrinsic::fmuladd: ISD = ISD::FMA;    break;
-  // FIXME: We should return 0 whenever getIntrinsicCost == TCC_Free.
-  case Intrinsic::lifetime_start:
-  case Intrinsic::lifetime_end:
-    return 0;
-  }
-
-  const TargetLoweringBase *TLI = getTLI();
-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(RetTy);
-
-  if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
-    // The operation is legal. Assume it costs 1.
-    // If the type is split to multiple registers, assume that there is some
-    // overhead to this.
-    // TODO: Once we have extract/insert subvector cost we need to use them.
-    if (LT.first > 1)
-      return LT.first * 2;
-    return LT.first * 1;
-  }
-
-  if (!TLI->isOperationExpand(ISD, LT.second)) {
-    // If the operation is custom lowered then assume
-    // thare the code is twice as expensive.
-    return LT.first * 2;
-  }
-
-  // If we can't lower fmuladd into an FMA estimate the cost as a floating
-  // point mul followed by an add.
-  if (IID == Intrinsic::fmuladd)
-    return TopTTI->getArithmeticInstrCost(BinaryOperator::FMul, RetTy) +
-           TopTTI->getArithmeticInstrCost(BinaryOperator::FAdd, RetTy);
-
-  // Else, assume that we need to scalarize this intrinsic. For math builtins
-  // this will emit a costly libcall, adding call overhead and spills. Make it
-  // very expensive.
-  if (RetTy->isVectorTy()) {
-    unsigned Num = RetTy->getVectorNumElements();
-    unsigned Cost = TopTTI->getIntrinsicInstrCost(IID, RetTy->getScalarType(),
-                                                  Tys);
-    return 10 * Cost * Num;
-  }
-
-  // This is going to be turned into a library call, make it expensive.
-  return 10;
-}
-
-unsigned BasicTTI::getNumberOfParts(Type *Tp) const {
-  std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(Tp);
-  return LT.first;
-}
-
-unsigned BasicTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
-  return 0;
-}
-
-unsigned BasicTTI::getReductionCost(unsigned Opcode, Type *Ty,
-                                    bool IsPairwise) const {
-  assert(Ty->isVectorTy() && "Expect a vector type");
-  unsigned NumVecElts = Ty->getVectorNumElements();
-  unsigned NumReduxLevels = Log2_32(NumVecElts);
-  unsigned ArithCost = NumReduxLevels *
-    TopTTI->getArithmeticInstrCost(Opcode, Ty);
-  // Assume the pairwise shuffles add a cost.
-  unsigned ShuffleCost =
-      NumReduxLevels * (IsPairwise + 1) *
-      TopTTI->getShuffleCost(SK_ExtractSubvector, Ty, NumVecElts / 2, Ty);
-  return ShuffleCost + ArithCost + getScalarizationOverhead(Ty, false, true);
-}
+BasicTTIImpl::BasicTTIImpl(const TargetMachine *TM, Function &F)
+    : BaseT(TM), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {}
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index 2128da1..b8f05cd 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -601,8 +601,7 @@
   // instructions that would be deleted in the merge.
   MachineFunction *MF = MBB1->getParent();
   if (EffectiveTailLen >= 2 &&
-      MF->getFunction()->getAttributes().
-        hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) &&
+      MF->getFunction()->hasFnAttribute(Attribute::OptimizeForSize) &&
       (I1 == MBB1->begin() || I2 == MBB2->begin()))
     return true;
 
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 092346b..f21d4d2 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -22,6 +22,7 @@
   ForwardControlFlowIntegrity.cpp
   GCMetadata.cpp
   GCMetadataPrinter.cpp
+  GCRootLowering.cpp
   GCStrategy.cpp
   GlobalMerge.cpp
   IfConversion.cpp
@@ -95,6 +96,7 @@
   ScheduleDAGPrinter.cpp
   ScoreboardHazardRecognizer.cpp
   ShadowStackGC.cpp
+  ShadowStackGCLowering.cpp
   SjLjEHPrepare.cpp
   SlotIndexes.cpp
   SpillPlacement.cpp
@@ -104,6 +106,7 @@
   StackSlotColoring.cpp
   StackMapLivenessAnalysis.cpp
   StackMaps.cpp
+  StatepointExampleGC.cpp
   TailDuplication.cpp
   TargetFrameLoweringImpl.cpp
   TargetInstrInfo.cpp
@@ -115,6 +118,11 @@
   TwoAddressInstructionPass.cpp
   UnreachableBlockElim.cpp
   VirtRegMap.cpp
+  WinEHPrepare.cpp
+
+  ADDITIONAL_HEADER_DIRS
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/CodeGen
+  ${LLVM_MAIN_INCLUDE_DIR}/llvm/CodeGen/PBQP
   )
 
 add_dependencies(LLVMCodeGen intrinsics_gen)
diff --git a/lib/CodeGen/CallingConvLower.cpp b/lib/CodeGen/CallingConvLower.cpp
index 56ecde0..034ffb3 100644
--- a/lib/CodeGen/CallingConvLower.cpp
+++ b/lib/CodeGen/CallingConvLower.cpp
@@ -14,9 +14,11 @@
 
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SaveAndRestore.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
@@ -178,3 +180,70 @@
     llvm_unreachable(nullptr);
   }
 }
+
+static bool isValueTypeInRegForCC(CallingConv::ID CC, MVT VT) {
+  if (VT.isVector())
+    return true; // Assume -msse-regparm might be in effect.
+  if (!VT.isInteger())
+    return false;
+  if (CC == CallingConv::X86_VectorCall || CC == CallingConv::X86_FastCall)
+    return true;
+  return false;
+}
+
+void CCState::getRemainingRegParmsForType(SmallVectorImpl<MCPhysReg> &Regs,
+                                          MVT VT, CCAssignFn Fn) {
+  unsigned SavedStackOffset = StackOffset;
+  unsigned NumLocs = Locs.size();
+
+  // Set the 'inreg' flag if it is used for this calling convention.
+  ISD::ArgFlagsTy Flags;
+  if (isValueTypeInRegForCC(CallingConv, VT))
+    Flags.setInReg();
+
+  // Allocate something of this value type repeatedly until we get assigned a
+  // location in memory.
+  bool HaveRegParm = true;
+  while (HaveRegParm) {
+    if (Fn(0, VT, VT, CCValAssign::Full, Flags, *this)) {
+#ifndef NDEBUG
+      dbgs() << "Call has unhandled type " << EVT(VT).getEVTString()
+             << " while computing remaining regparms\n";
+#endif
+      llvm_unreachable(nullptr);
+    }
+    HaveRegParm = Locs.back().isRegLoc();
+  }
+
+  // Copy all the registers from the value locations we added.
+  assert(NumLocs < Locs.size() && "CC assignment failed to add location");
+  for (unsigned I = NumLocs, E = Locs.size(); I != E; ++I)
+    if (Locs[I].isRegLoc())
+      Regs.push_back(MCPhysReg(Locs[I].getLocReg()));
+
+  // Clear the assigned values and stack memory. We leave the registers marked
+  // as allocated so that future queries don't return the same registers, i.e.
+  // when i64 and f64 are both passed in GPRs.
+  StackOffset = SavedStackOffset;
+  Locs.resize(NumLocs);
+}
+
+void CCState::analyzeMustTailForwardedRegisters(
+    SmallVectorImpl<ForwardedRegister> &Forwards, ArrayRef<MVT> RegParmTypes,
+    CCAssignFn Fn) {
+  // Oftentimes calling conventions will not use register parameters for
+  // variadic functions, so we need to assume we're not variadic so that we get
+  // all the registers that might be used in a non-variadic call.
+  SaveAndRestore<bool> SavedVarArg(IsVarArg, false);
+
+  for (MVT RegVT : RegParmTypes) {
+    SmallVector<MCPhysReg, 8> RemainingRegs;
+    getRemainingRegParmsForType(RemainingRegs, RegVT, Fn);
+    const TargetLowering *TL = MF.getSubtarget().getTargetLowering();
+    const TargetRegisterClass *RC = TL->getRegClassFor(RegVT);
+    for (MCPhysReg PReg : RemainingRegs) {
+      unsigned VReg = MF.addLiveIn(PReg, RC);
+      Forwards.push_back(ForwardedRegister(VReg, PReg, RegVT));
+    }
+  }
+}
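
The probing loop in getRemainingRegParmsForType above can be pictured with the toy model below; the names and the four-register convention are made up for illustration and are not the LLVM calling-convention API. The idea is the same: keep requesting locations until one lands in memory, and remember every register handed out before that point.

#include <vector>

struct Loc { bool IsReg; unsigned Reg; };

// Hypothetical stand-in for a CCAssignFn: hands out registers 0..3, then memory.
static Loc assignNext(unsigned &NextReg) {
  if (NextReg < 4)
    return {true, NextReg++};
  return {false, 0};
}

// Registers that one more (non-variadic) argument of this type could still use.
static std::vector<unsigned> remainingRegParms(unsigned &NextReg) {
  std::vector<unsigned> Regs;
  for (;;) {
    Loc L = assignNext(NextReg);
    if (!L.IsReg)
      break;               // the first memory assignment ends the probe
    Regs.push_back(L.Reg);
  }
  return Regs;
}
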
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index 307dec5..7c0068e 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -21,7 +21,6 @@
 /// initializeCodeGen - Initialize all passes linked into the CodeGen library.
 void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeAtomicExpandPass(Registry);
-  initializeBasicTTIPass(Registry);
   initializeBranchFolderPassPass(Registry);
   initializeCodeGenPreparePass(Registry);
   initializeDeadMachineInstructionElimPass(Registry);
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 8d20848..c0d7dca 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -18,6 +18,7 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
@@ -30,20 +31,22 @@
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Statepoint.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/IR/ValueMap.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
@@ -70,6 +73,10 @@
   "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
   cl::desc("Disable branch optimizations in CodeGenPrepare"));
 
+static cl::opt<bool>
+    DisableGCOpts("disable-cgp-gc-opts", cl::Hidden, cl::init(false),
+                  cl::desc("Disable GC optimizations in CodeGenPrepare"));
+
 static cl::opt<bool> DisableSelectToBranch(
   "disable-cgp-select2branch", cl::Hidden, cl::init(false),
   cl::desc("Disable select to branch conversion."));
@@ -90,6 +97,16 @@
     "stress-cgp-store-extract", cl::Hidden, cl::init(false),
     cl::desc("Stress test store(extract) optimizations in CodeGenPrepare"));
 
+static cl::opt<bool> DisableExtLdPromotion(
+    "disable-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
+    cl::desc("Disable ext(promotable(ld)) -> promoted(ext(ld)) optimization in "
+             "CodeGenPrepare"));
+
+static cl::opt<bool> StressExtLdPromotion(
+    "stress-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
+    cl::desc("Stress test ext(promotable(ld)) -> promoted(ext(ld)) "
+             "optimization in CodeGenPrepare"));
+
 namespace {
 typedef SmallPtrSet<Instruction *, 16> SetOfInstrs;
 struct TypeIsSExt {
@@ -98,6 +115,7 @@
   TypeIsSExt(Type *Ty, bool IsSExt) : Ty(Ty), IsSExt(IsSExt) {}
 };
 typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy;
+class TypePromotionTransaction;
 
   class CodeGenPrepare : public FunctionPass {
     /// TLI - Keep a pointer of a TargetLowering to consult for determining
@@ -143,8 +161,8 @@
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addPreserved<DominatorTreeWrapperPass>();
-      AU.addRequired<TargetLibraryInfo>();
-      AU.addRequired<TargetTransformInfo>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
+      AU.addRequired<TargetTransformInfoWrapperPass>();
     }
 
   private:
@@ -152,12 +170,12 @@
     bool EliminateMostlyEmptyBlocks(Function &F);
     bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
     void EliminateMostlyEmptyBlock(BasicBlock *BB);
-    bool OptimizeBlock(BasicBlock &BB);
-    bool OptimizeInst(Instruction *I);
+    bool OptimizeBlock(BasicBlock &BB, bool& ModifiedDT);
+    bool OptimizeInst(Instruction *I, bool& ModifiedDT);
     bool OptimizeMemoryInst(Instruction *I, Value *Addr, Type *AccessTy);
     bool OptimizeInlineAsmInst(CallInst *CS);
-    bool OptimizeCallInst(CallInst *CI);
-    bool MoveExtToFormExtLoad(Instruction *I);
+    bool OptimizeCallInst(CallInst *CI, bool& ModifiedDT);
+    bool MoveExtToFormExtLoad(Instruction *&I);
     bool OptimizeExtUses(Instruction *I);
     bool OptimizeSelectInst(SelectInst *SI);
     bool OptimizeShuffleVectorInst(ShuffleVectorInst *SI);
@@ -165,6 +183,12 @@
     bool DupRetToEnableTailCallOpts(BasicBlock *BB);
     bool PlaceDbgValues(Function &F);
     bool sinkAndCmp(Function &F);
+    bool ExtLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI,
+                        Instruction *&Inst,
+                        const SmallVectorImpl<Instruction *> &Exts,
+                        unsigned CreatedInst);
+    bool splitBranchCondition(Function &F);
+    bool simplifyOffsetableRelocate(Instruction &I);
   };
 }
 
@@ -187,14 +211,13 @@
 
   ModifiedDT = false;
   if (TM)
-    TLI = TM->getSubtargetImpl()->getTargetLowering();
-  TLInfo = &getAnalysis<TargetLibraryInfo>();
-  TTI = &getAnalysis<TargetTransformInfo>();
+    TLI = TM->getSubtargetImpl(F)->getTargetLowering();
+  TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+  TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   DominatorTreeWrapperPass *DTWP =
       getAnalysisIfAvailable<DominatorTreeWrapperPass>();
   DT = DTWP ? &DTWP->getDomTree() : nullptr;
-  OptSize = F.getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                           Attribute::OptimizeForSize);
+  OptSize = F.hasFnAttribute(Attribute::OptimizeForSize);
 
   /// This optimization identifies DIV instructions that can be
   /// profitably bypassed and carried out with a shorter, faster divide.
@@ -218,15 +241,23 @@
   // into a single target instruction, push the mask and compare into branch
   // users. Do this before OptimizeBlock -> OptimizeInst ->
   // OptimizeCmpExpression, which perturbs the pattern being searched for.
-  if (!DisableBranchOpts)
+  if (!DisableBranchOpts) {
     EverMadeChange |= sinkAndCmp(F);
+    EverMadeChange |= splitBranchCondition(F);
+  }
 
   bool MadeChange = true;
   while (MadeChange) {
     MadeChange = false;
     for (Function::iterator I = F.begin(); I != F.end(); ) {
       BasicBlock *BB = I++;
-      MadeChange |= OptimizeBlock(*BB);
+      bool ModifiedDTOnIteration = false;
+      MadeChange |= OptimizeBlock(*BB, ModifiedDTOnIteration);
+
+      // Restart BB iteration if the dominator tree of the Function was changed
+      ModifiedDT |= ModifiedDTOnIteration;
+      if (ModifiedDTOnIteration)
+        break;
     }
     EverMadeChange |= MadeChange;
   }
@@ -236,9 +267,9 @@
   if (!DisableBranchOpts) {
     MadeChange = false;
     SmallPtrSet<BasicBlock*, 8> WorkList;
-    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
-      SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB));
-      MadeChange |= ConstantFoldTerminator(BB, true);
+    for (BasicBlock &BB : F) {
+      SmallVector<BasicBlock *, 2> Successors(succ_begin(&BB), succ_end(&BB));
+      MadeChange |= ConstantFoldTerminator(&BB, true);
       if (!MadeChange) continue;
 
       for (SmallVectorImpl<BasicBlock*>::iterator
@@ -272,6 +303,16 @@
     EverMadeChange |= MadeChange;
   }
 
+  if (!DisableGCOpts) {
+    SmallVector<Instruction *, 2> Statepoints;
+    for (BasicBlock &BB : F)
+      for (Instruction &I : BB)
+        if (isStatepoint(I))
+          Statepoints.push_back(&I);
+    for (auto &I : Statepoints)
+      EverMadeChange |= simplifyOffsetableRelocate(*I);
+  }
+
   if (ModifiedDT && DT)
     DT->recalculate(F);
 
@@ -300,7 +341,7 @@
       // Remember if SinglePred was the entry block of the function.
       // If so, we will need to move BB back to the entry position.
       bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
-      MergeBasicBlockIntoOnlyPred(BB, this);
+      MergeBasicBlockIntoOnlyPred(BB, DT);
 
       if (isEntry && BB != &BB->getParent()->getEntryBlock())
         BB->moveBefore(&BB->getParent()->getEntryBlock());
@@ -440,7 +481,7 @@
       // Remember if SinglePred was the entry block of the function.  If so, we
       // will need to move BB back to the entry position.
       bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
-      MergeBasicBlockIntoOnlyPred(DestBB, this);
+      MergeBasicBlockIntoOnlyPred(DestBB, DT);
 
       if (isEntry && BB != &BB->getParent()->getEntryBlock())
         BB->moveBefore(&BB->getParent()->getEntryBlock());
@@ -495,6 +536,144 @@
   DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
 }
 
+// Computes a map of base pointer relocation instructions to corresponding
+// derived pointer relocation instructions given a vector of all relocate calls
+static void computeBaseDerivedRelocateMap(
+    const SmallVectorImpl<User *> &AllRelocateCalls,
+    DenseMap<IntrinsicInst *, SmallVector<IntrinsicInst *, 2>> &
+        RelocateInstMap) {
+  // Collect information in two maps: one primarily for locating the base object
+  // while filling the second map; the second map is the final structure holding
+  // a mapping between Base and corresponding Derived relocate calls
+  DenseMap<std::pair<unsigned, unsigned>, IntrinsicInst *> RelocateIdxMap;
+  for (auto &U : AllRelocateCalls) {
+    GCRelocateOperands ThisRelocate(U);
+    IntrinsicInst *I = cast<IntrinsicInst>(U);
+    auto K = std::make_pair(ThisRelocate.basePtrIndex(),
+                            ThisRelocate.derivedPtrIndex());
+    RelocateIdxMap.insert(std::make_pair(K, I));
+  }
+  for (auto &Item : RelocateIdxMap) {
+    std::pair<unsigned, unsigned> Key = Item.first;
+    if (Key.first == Key.second)
+      // Base relocation: nothing to insert
+      continue;
+
+    IntrinsicInst *I = Item.second;
+    auto BaseKey = std::make_pair(Key.first, Key.first);
+    IntrinsicInst *Base = RelocateIdxMap[BaseKey];
+    if (!Base)
+      // TODO: We might want to insert a new base object relocate and gep off
+      // that, if there are enough derived object relocates.
+      continue;
+    RelocateInstMap[Base].push_back(I);
+  }
+}
+
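A minimal standalone sketch of the (base, derived) index-pair grouping above, assuming a statepoint that lists %base at operand index 4 and %ptr at index 5 (illustration only, plain C++ containers rather than the pass's data structures):

  #include <map>
  #include <string>
  #include <utility>
  #include <vector>

  int main() {
    // Key = (basePtrIndex, derivedPtrIndex); a base relocate has equal indices.
    std::map<std::pair<unsigned, unsigned>, std::string> RelocateIdxMap = {
        {{4, 4}, "%base.relocated"}, {{4, 5}, "%ptr.relocated"}};
    std::map<std::string, std::vector<std::string>> RelocateInstMap;
    for (const auto &Item : RelocateIdxMap) {
      if (Item.first.first == Item.first.second)
        continue; // base relocation: nothing to insert
      auto BaseIt = RelocateIdxMap.find({Item.first.first, Item.first.first});
      if (BaseIt == RelocateIdxMap.end())
        continue; // no base relocate to anchor the rewrite on
      RelocateInstMap[BaseIt->second].push_back(Item.second);
    }
    // RelocateInstMap now maps "%base.relocated" -> {"%ptr.relocated"}.
    return 0;
  }
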
+// Accepts a GEP and extracts the operands into a vector provided they're all
+// small integer constants
+static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP,
+                                          SmallVectorImpl<Value *> &OffsetV) {
+  for (unsigned i = 1; i < GEP->getNumOperands(); i++) {
+    // Only accept small constant integer operands
+    auto Op = dyn_cast<ConstantInt>(GEP->getOperand(i));
+    if (!Op || Op->getZExtValue() > 20)
+      return false;
+  }
+
+  for (unsigned i = 1; i < GEP->getNumOperands(); i++)
+    OffsetV.push_back(GEP->getOperand(i));
+  return true;
+}
+
+// Takes a RelocatedBase (base pointer relocation instruction) and Targets to
+// replace, computes a replacement, and applies it.
+static bool
+simplifyRelocatesOffABase(IntrinsicInst *RelocatedBase,
+                          const SmallVectorImpl<IntrinsicInst *> &Targets) {
+  bool MadeChange = false;
+  for (auto &ToReplace : Targets) {
+    GCRelocateOperands MasterRelocate(RelocatedBase);
+    GCRelocateOperands ThisRelocate(ToReplace);
+
+    assert(ThisRelocate.basePtrIndex() == MasterRelocate.basePtrIndex() &&
+           "Not relocating a derived object of the original base object");
+    if (ThisRelocate.basePtrIndex() == ThisRelocate.derivedPtrIndex()) {
+      // A duplicate relocate call. TODO: coalesce duplicates.
+      continue;
+    }
+
+    Value *Base = ThisRelocate.basePtr();
+    auto Derived = dyn_cast<GetElementPtrInst>(ThisRelocate.derivedPtr());
+    if (!Derived || Derived->getPointerOperand() != Base)
+      continue;
+
+    SmallVector<Value *, 2> OffsetV;
+    if (!getGEPSmallConstantIntOffsetV(Derived, OffsetV))
+      continue;
+
+    // Create a Builder and replace the target callsite with a gep
+    IRBuilder<> Builder(ToReplace);
+    Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc());
+    Value *Replacement =
+        Builder.CreateGEP(RelocatedBase, makeArrayRef(OffsetV));
+    Instruction *ReplacementInst = cast<Instruction>(Replacement);
+    ReplacementInst->removeFromParent();
+    ReplacementInst->insertAfter(RelocatedBase);
+    Replacement->takeName(ToReplace);
+    ToReplace->replaceAllUsesWith(Replacement);
+    ToReplace->eraseFromParent();
+
+    MadeChange = true;
+  }
+  return MadeChange;
+}
+
+// Turns this:
+//
+// %base = ...
+// %ptr = gep %base + 15
+// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
+// %base' = relocate(%tok, i32 4, i32 4)
+// %ptr' = relocate(%tok, i32 4, i32 5)
+// %val = load %ptr'
+//
+// into this:
+//
+// %base = ...
+// %ptr = gep %base + 15
+// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
+// %base' = gc.relocate(%tok, i32 4, i32 4)
+// %ptr' = gep %base' + 15
+// %val = load %ptr'
+bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) {
+  bool MadeChange = false;
+  SmallVector<User *, 2> AllRelocateCalls;
+
+  for (auto *U : I.users())
+    if (isGCRelocate(dyn_cast<Instruction>(U)))
+      // Collect all the relocate calls associated with a statepoint
+      AllRelocateCalls.push_back(U);
+
+  // We need at least one base pointer relocation + one derived pointer
+  // relocation to do the rewrite
+  if (AllRelocateCalls.size() < 2)
+    return false;
+
+  // RelocateInstMap is a mapping from the base relocate instruction to the
+  // corresponding derived relocate instructions
+  DenseMap<IntrinsicInst *, SmallVector<IntrinsicInst *, 2>> RelocateInstMap;
+  computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap);
+  if (RelocateInstMap.empty())
+    return false;
+
+  for (auto &Item : RelocateInstMap)
+    // Item.first is the RelocatedBase to offset against
+    // Item.second is the vector of Targets to replace
+    MadeChange |= simplifyRelocatesOffABase(Item.first, Item.second);
+  return MadeChange;
+}
+
 /// SinkCast - Sink the specified cast instruction into its user blocks
 static bool SinkCast(CastInst *CI) {
   BasicBlock *DefBB = CI->getParent();
@@ -822,23 +1001,211 @@
   return MadeChange;
 }
 
-namespace {
-class CodeGenPrepareFortifiedLibCalls : public SimplifyFortifiedLibCalls {
-protected:
-  void replaceCall(Value *With) override {
-    CI->replaceAllUsesWith(With);
-    CI->eraseFromParent();
-  }
-  bool isFoldable(unsigned SizeCIOp, unsigned, bool) const override {
-      if (ConstantInt *SizeCI =
-                             dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp)))
-        return SizeCI->isAllOnesValue();
-    return false;
-  }
-};
-} // end anonymous namespace
+//  ScalarizeMaskedLoad() translates the masked load intrinsic, e.g.
+// <16 x i32> @llvm.masked.load(<16 x i32>* %addr, i32 align,
+//                              <16 x i1> %mask, <16 x i32> %passthru)
+// into a chain of basic blocks that load each element individually if
+// the corresponding mask bit is set.
+//
+//  %1 = bitcast i8* %addr to i32*
+//  %2 = extractelement <16 x i1> %mask, i32 0
+//  %3 = icmp eq i1 %2, true
+//  br i1 %3, label %cond.load, label %else
+//
+//cond.load:                                        ; preds = %0
+//  %4 = getelementptr i32* %1, i32 0
+//  %5 = load i32* %4
+//  %6 = insertelement <16 x i32> undef, i32 %5, i32 0
+//  br label %else
+//
+//else:                                             ; preds = %0, %cond.load
+//  %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ]
+//  %7 = extractelement <16 x i1> %mask, i32 1
+//  %8 = icmp eq i1 %7, true
+//  br i1 %8, label %cond.load1, label %else2
+//
+//cond.load1:                                       ; preds = %else
+//  %9 = getelementptr i32* %1, i32 1
+//  %10 = load i32* %9
+//  %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1
+//  br label %else2
+//
+//else2:                                            ; preds = %else, %cond.load1
+//  %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+//  %12 = extractelement <16 x i1> %mask, i32 2
+//  %13 = icmp eq i1 %12, true
+//  br i1 %13, label %cond.load4, label %else5
+//
+static void ScalarizeMaskedLoad(CallInst *CI) {
+  Value *Ptr  = CI->getArgOperand(0);
+  Value *Src0 = CI->getArgOperand(3);
+  Value *Mask = CI->getArgOperand(2);
+  VectorType *VecType = dyn_cast<VectorType>(CI->getType());
+  assert(VecType && "Unexpected return type of masked load intrinsic");
 
-bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
+  Type *EltTy = VecType->getElementType();
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  BasicBlock *CondBlock = nullptr;
+  BasicBlock *PrevIfBlock = CI->getParent();
+  Builder.SetInsertPoint(InsertPt);
+
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  // Bitcast %addr from i8* to EltTy*
+  Type *NewPtrType =
+    EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+  Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+  Value *UndefVal = UndefValue::get(VecType);
+
+  // The result vector
+  Value *VResult = UndefVal;
+
+  PHINode *Phi = nullptr;
+  Value *PrevPhi = UndefVal;
+
+  unsigned VectorWidth = VecType->getNumElements();
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+    // Fill the "else" block, created in the previous iteration
+    //
+    //  %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+    //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+    //  %to_load = icmp eq i1 %mask_1, true
+    //  br i1 %to_load, label %cond.load, label %else
+    //
+    if (Idx > 0) {
+      Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+      Phi->addIncoming(VResult, CondBlock);
+      Phi->addIncoming(PrevPhi, PrevIfBlock);
+      PrevPhi = Phi;
+      VResult = Phi;
+    }
+
+    Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+                                    ConstantInt::get(Predicate->getType(), 1));
+
+    // Create "cond" block
+    //
+    //  %EltAddr = getelementptr i32* %1, i32 0
+    //  %Elt = load i32* %EltAddr
+    //  VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+    //
+    CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
+    Builder.SetInsertPoint(InsertPt);
+
+    Value* Gep = Builder.CreateInBoundsGEP(FirstEltPtr, Builder.getInt32(Idx));
+    LoadInst* Load = Builder.CreateLoad(Gep, false);
+    VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+    OldBr->eraseFromParent();
+    PrevIfBlock = IfBlock;
+    IfBlock = NewIfBlock;
+  }
+
+  Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
+  Phi->addIncoming(VResult, CondBlock);
+  Phi->addIncoming(PrevPhi, PrevIfBlock);
+  Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
+  CI->replaceAllUsesWith(NewI);
+  CI->eraseFromParent();
+}
+
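For reference, a small plain-C++ model of the lane-by-lane semantics the emitted block chain implements, on an assumed 4-wide example (illustration only; the real transformation stays in IR):

  #include <array>

  // Lanes whose mask bit is clear keep the pass-through value and never touch
  // memory; lanes whose mask bit is set load their element from the address.
  std::array<int, 4> maskedLoadModel(const int *Addr,
                                     const std::array<bool, 4> &Mask,
                                     const std::array<int, 4> &PassThru) {
    std::array<int, 4> Result = PassThru;
    for (unsigned Idx = 0; Idx < 4; ++Idx)
      if (Mask[Idx])
        Result[Idx] = Addr[Idx];
    return Result;
  }
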
+//  ScalarizeMaskedStore() translates the masked store intrinsic, e.g.
+// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
+//                         <16 x i1> %mask)
+// into a chain of basic blocks that store each element individually if
+// the corresponding mask bit is set.
+//
+//   %1 = bitcast i8* %addr to i32*
+//   %2 = extractelement <16 x i1> %mask, i32 0
+//   %3 = icmp eq i1 %2, true
+//   br i1 %3, label %cond.store, label %else
+//
+// cond.store:                                       ; preds = %0
+//   %4 = extractelement <16 x i32> %val, i32 0
+//   %5 = getelementptr i32* %1, i32 0
+//   store i32 %4, i32* %5
+//   br label %else
+// 
+// else:                                             ; preds = %0, %cond.store
+//   %6 = extractelement <16 x i1> %mask, i32 1
+//   %7 = icmp eq i1 %6, true
+//   br i1 %7, label %cond.store1, label %else2
+// 
+// cond.store1:                                      ; preds = %else
+//   %8 = extractelement <16 x i32> %val, i32 1
+//   %9 = getelementptr i32* %1, i32 1
+//   store i32 %8, i32* %9
+//   br label %else2
+//   . . .
+static void ScalarizeMaskedStore(CallInst *CI) {
+  Value *Ptr  = CI->getArgOperand(1);
+  Value *Src = CI->getArgOperand(0);
+  Value *Mask = CI->getArgOperand(3);
+
+  VectorType *VecType = dyn_cast<VectorType>(Src->getType());
+  assert(VecType && "Unexpected data type in masked store intrinsic");
+  Type *EltTy = VecType->getElementType();
+
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  Builder.SetInsertPoint(InsertPt);
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  // Bitcast %addr from i8* to EltTy*
+  Type *NewPtrType =
+    EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+  Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+
+  unsigned VectorWidth = VecType->getNumElements();
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+    // Fill the "else" block, created in the previous iteration
+    //
+    //  %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+    //  %to_store = icmp eq i1 %mask_1, true
+    //  br i1 %to_store, label %cond.store, label %else
+    //
+    Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+                                    ConstantInt::get(Predicate->getType(), 1));
+
+    // Create "cond" block
+    //
+    //  %OneElt = extractelement <16 x i32> %Src, i32 Idx
+    //  %EltAddr = getelementptr i32* %1, i32 0
+    //  store i32 %OneElt, i32* %EltAddr
+    //
+    BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+    Builder.SetInsertPoint(InsertPt);
+
+    Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
+    Value* Gep = Builder.CreateInBoundsGEP(FirstEltPtr, Builder.getInt32(Idx));
+    Builder.CreateStore(OneElt, Gep);
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+    OldBr->eraseFromParent();
+    IfBlock = NewIfBlock;
+  }
+  CI->eraseFromParent();
+}
+
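Analogously, a small plain-C++ model of the store semantics being materialized (illustration only, assumed 4-wide example):

  #include <array>

  // Only lanes whose mask bit is set are written; masked-off lanes leave
  // memory untouched.
  void maskedStoreModel(int *Addr, const std::array<int, 4> &Src,
                        const std::array<bool, 4> &Mask) {
    for (unsigned Idx = 0; Idx < 4; ++Idx)
      if (Mask[Idx])
        Addr[Idx] = Src[Idx];
  }
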
+bool CodeGenPrepare::OptimizeCallInst(CallInst *CI, bool& ModifiedDT) {
   BasicBlock *BB = CI->getParent();
 
   // Lower inline assembly if we can.
@@ -858,38 +1225,60 @@
       return true;
   }
 
-  // Lower all uses of llvm.objectsize.*
   IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
-  if (II && II->getIntrinsicID() == Intrinsic::objectsize) {
-    bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1);
-    Type *ReturnTy = CI->getType();
-    Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL);
+  if (II) {
+    switch (II->getIntrinsicID()) {
+    default: break;
+    case Intrinsic::objectsize: {
+      // Lower all uses of llvm.objectsize.*
+      bool Min = (cast<ConstantInt>(II->getArgOperand(1))->getZExtValue() == 1);
+      Type *ReturnTy = CI->getType();
+      Constant *RetVal = ConstantInt::get(ReturnTy, Min ? 0 : -1ULL);
 
-    // Substituting this can cause recursive simplifications, which can
-    // invalidate our iterator.  Use a WeakVH to hold onto it in case this
-    // happens.
-    WeakVH IterHandle(CurInstIterator);
+      // Substituting this can cause recursive simplifications, which can
+      // invalidate our iterator.  Use a WeakVH to hold onto it in case this
+      // happens.
+      WeakVH IterHandle(CurInstIterator);
 
-    replaceAndRecursivelySimplify(CI, RetVal,
-                                  TLI ? TLI->getDataLayout() : nullptr,
-                                  TLInfo, ModifiedDT ? nullptr : DT);
+      replaceAndRecursivelySimplify(CI, RetVal,
+                                    TLI ? TLI->getDataLayout() : nullptr,
+                                    TLInfo, ModifiedDT ? nullptr : DT);
 
-    // If the iterator instruction was recursively deleted, start over at the
-    // start of the block.
-    if (IterHandle != CurInstIterator) {
-      CurInstIterator = BB->begin();
-      SunkAddrs.clear();
+      // If the iterator instruction was recursively deleted, start over at the
+      // start of the block.
+      if (IterHandle != CurInstIterator) {
+        CurInstIterator = BB->begin();
+        SunkAddrs.clear();
+      }
+      return true;
     }
-    return true;
-  }
+    case Intrinsic::masked_load: {
+      // Scalarize unsupported vector masked load
+      if (!TTI->isLegalMaskedLoad(CI->getType(), 1)) {
+        ScalarizeMaskedLoad(CI);
+        ModifiedDT = true;
+        return true;
+      }
+      return false;
+    }
+    case Intrinsic::masked_store: {
+      if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType(), 1)) {
+        ScalarizeMaskedStore(CI);
+        ModifiedDT = true;
+        return true;
+      }
+      return false;
+    }
+    }
 
-  if (II && TLI) {
-    SmallVector<Value*, 2> PtrOps;
-    Type *AccessTy;
-    if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy))
-      while (!PtrOps.empty())
-        if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy))
-          return true;
+    if (TLI) {
+      SmallVector<Value*, 2> PtrOps;
+      Type *AccessTy;
+      if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy))
+        while (!PtrOps.empty())
+          if (OptimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy))
+            return true;
+    }
   }
 
   // From here on out we're working with named functions.
@@ -901,10 +1290,15 @@
 
   // Lower all default uses of _chk calls.  This is very similar
   // to what InstCombineCalls does, but here we are only lowering calls
-  // that have the default "don't know" as the objectsize.  Anything else
-  // should be left alone.
-  CodeGenPrepareFortifiedLibCalls Simplifier;
-  return Simplifier.fold(CI, TD, TLInfo);
+  // to fortified library functions (e.g. __memcpy_chk) that have the default
+  // "don't know" as the objectsize.  Anything else should be left alone.
+  FortifiedLibCallSimplifier Simplifier(TD, TLInfo, true);
+  if (Value *V = Simplifier.optimizeCall(CI)) {
+    CI->replaceAllUsesWith(V);
+    CI->eraseFromParent();
+    return true;
+  }
+  return false;
 }
 
 /// DupRetToEnableTailCallOpts - Look for opportunities to duplicate return
@@ -1561,6 +1955,7 @@
 /// This encapsulates the logic for matching the target-legal addressing modes.
 class AddressingModeMatcher {
   SmallVectorImpl<Instruction*> &AddrModeInsts;
+  const TargetMachine &TM;
   const TargetLowering &TLI;
 
   /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and
@@ -1584,13 +1979,15 @@
   /// always returns true.
   bool IgnoreProfitability;
 
-  AddressingModeMatcher(SmallVectorImpl<Instruction*> &AMI,
-                        const TargetLowering &T, Type *AT,
-                        Instruction *MI, ExtAddrMode &AM,
-                        const SetOfInstrs &InsertedTruncs,
+  AddressingModeMatcher(SmallVectorImpl<Instruction *> &AMI,
+                        const TargetMachine &TM, Type *AT, Instruction *MI,
+                        ExtAddrMode &AM, const SetOfInstrs &InsertedTruncs,
                         InstrToOrigTy &PromotedInsts,
                         TypePromotionTransaction &TPT)
-      : AddrModeInsts(AMI), TLI(T), AccessTy(AT), MemoryInst(MI), AddrMode(AM),
+      : AddrModeInsts(AMI), TM(TM),
+        TLI(*TM.getSubtargetImpl(*MI->getParent()->getParent())
+                 ->getTargetLowering()),
+        AccessTy(AT), MemoryInst(MI), AddrMode(AM),
         InsertedTruncs(InsertedTruncs), PromotedInsts(PromotedInsts), TPT(TPT) {
     IgnoreProfitability = false;
   }
@@ -1607,13 +2004,13 @@
   static ExtAddrMode Match(Value *V, Type *AccessTy,
                            Instruction *MemoryInst,
                            SmallVectorImpl<Instruction*> &AddrModeInsts,
-                           const TargetLowering &TLI,
+                           const TargetMachine &TM,
                            const SetOfInstrs &InsertedTruncs,
                            InstrToOrigTy &PromotedInsts,
                            TypePromotionTransaction &TPT) {
     ExtAddrMode Result;
 
-    bool Success = AddressingModeMatcher(AddrModeInsts, TLI, AccessTy,
+    bool Success = AddressingModeMatcher(AddrModeInsts, TM, AccessTy,
                                          MemoryInst, Result, InsertedTruncs,
                                          PromotedInsts, TPT).MatchAddr(V, 0);
     (void)Success; assert(Success && "Couldn't select *anything*?");
@@ -1718,6 +2115,23 @@
   }
 }
 
+/// \brief Check whether or not \p Val is a legal instruction for \p TLI.
+/// \note \p Val is assumed to be the product of some type promotion.
+/// Therefore if \p Val has an undefined state in \p TLI, this is assumed
+/// to be legal, as the non-promoted value would have had the same state.
+static bool isPromotedInstructionLegal(const TargetLowering &TLI, Value *Val) {
+  Instruction *PromotedInst = dyn_cast<Instruction>(Val);
+  if (!PromotedInst)
+    return false;
+  int ISDOpcode = TLI.InstructionOpcodeToISD(PromotedInst->getOpcode());
+  // If the ISDOpcode is undefined, it was undefined before the promotion.
+  if (!ISDOpcode)
+    return true;
+  // Otherwise, check if the promoted instruction is legal or not.
+  return TLI.isOperationLegalOrCustom(
+      ISDOpcode, TLI.getValueType(PromotedInst->getType()));
+}
+
 /// \brief Hepler class to perform type promotion.
 class TypePromotionHelper {
   /// \brief Utility function to check whether or not a sign or zero extension
@@ -1747,46 +2161,59 @@
   /// \p PromotedInsts maps the instructions to their type before promotion.
   /// \p CreatedInsts[out] contains how many non-free instructions have been
   /// created to promote the operand of Ext.
+  /// Newly added extensions are inserted in \p Exts.
+  /// Newly added truncates are inserted in \p Truncs.
   /// Should never be called directly.
   /// \return The promoted value which is used instead of Ext.
-  static Value *promoteOperandForTruncAndAnyExt(Instruction *Ext,
-                                                TypePromotionTransaction &TPT,
-                                                InstrToOrigTy &PromotedInsts,
-                                                unsigned &CreatedInsts);
+  static Value *promoteOperandForTruncAndAnyExt(
+      Instruction *Ext, TypePromotionTransaction &TPT,
+      InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts,
+      SmallVectorImpl<Instruction *> *Exts,
+      SmallVectorImpl<Instruction *> *Truncs);
 
   /// \brief Utility function to promote the operand of \p Ext when this
   /// operand is promotable and is not a supported trunc or sext.
   /// \p PromotedInsts maps the instructions to their type before promotion.
   /// \p CreatedInsts[out] contains how many non-free instructions have been
   /// created to promote the operand of Ext.
+  /// Newly added extensions are inserted in \p Exts.
+  /// Newly added truncates are inserted in \p Truncs.
   /// Should never be called directly.
   /// \return The promoted value which is used instead of Ext.
-  static Value *promoteOperandForOther(Instruction *Ext,
-                                       TypePromotionTransaction &TPT,
-                                       InstrToOrigTy &PromotedInsts,
-                                       unsigned &CreatedInsts, bool IsSExt);
+  static Value *
+  promoteOperandForOther(Instruction *Ext, TypePromotionTransaction &TPT,
+                         InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts,
+                         SmallVectorImpl<Instruction *> *Exts,
+                         SmallVectorImpl<Instruction *> *Truncs, bool IsSExt);
 
   /// \see promoteOperandForOther.
-  static Value *signExtendOperandForOther(Instruction *Ext,
-                                          TypePromotionTransaction &TPT,
-                                          InstrToOrigTy &PromotedInsts,
-                                          unsigned &CreatedInsts) {
-    return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInsts, true);
+  static Value *
+  signExtendOperandForOther(Instruction *Ext, TypePromotionTransaction &TPT,
+                            InstrToOrigTy &PromotedInsts,
+                            unsigned &CreatedInsts,
+                            SmallVectorImpl<Instruction *> *Exts,
+                            SmallVectorImpl<Instruction *> *Truncs) {
+    return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInsts, Exts,
+                                  Truncs, true);
   }
 
   /// \see promoteOperandForOther.
-  static Value *zeroExtendOperandForOther(Instruction *Ext,
-                                          TypePromotionTransaction &TPT,
-                                          InstrToOrigTy &PromotedInsts,
-                                          unsigned &CreatedInsts) {
-    return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInsts, false);
+  static Value *
+  zeroExtendOperandForOther(Instruction *Ext, TypePromotionTransaction &TPT,
+                            InstrToOrigTy &PromotedInsts,
+                            unsigned &CreatedInsts,
+                            SmallVectorImpl<Instruction *> *Exts,
+                            SmallVectorImpl<Instruction *> *Truncs) {
+    return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInsts, Exts,
+                                  Truncs, false);
   }
 
 public:
   /// Type for the utility function that promotes the operand of Ext.
   typedef Value *(*Action)(Instruction *Ext, TypePromotionTransaction &TPT,
-                           InstrToOrigTy &PromotedInsts,
-                           unsigned &CreatedInsts);
+                           InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts,
+                           SmallVectorImpl<Instruction *> *Exts,
+                           SmallVectorImpl<Instruction *> *Truncs);
   /// \brief Given a sign/zero extend instruction \p Ext, return the approriate
   /// action to promote the operand of \p Ext instead of using Ext.
   /// \return NULL if no promotable action is possible with the current
@@ -1805,6 +2232,12 @@
                                         Type *ConsideredExtType,
                                         const InstrToOrigTy &PromotedInsts,
                                         bool IsSExt) {
+  // The promotion helper does not know how to deal with vector types yet.
+  // To be able to fix that, we would need to fix the places where we
+  // statically extend, e.g., constants and such.
+  if (Inst->getType()->isVectorTy())
+    return false;
+
   // We can always get through zext.
   if (isa<ZExtInst>(Inst))
     return true;
@@ -1830,8 +2263,9 @@
   // Check if we can use this operand in the extension.
   // If the type is larger than the result type of the extension,
   // we cannot.
-  if (OpndVal->getType()->getIntegerBitWidth() >
-      ConsideredExtType->getIntegerBitWidth())
+  if (!OpndVal->getType()->isIntegerTy() ||
+      OpndVal->getType()->getIntegerBitWidth() >
+          ConsideredExtType->getIntegerBitWidth())
     return false;
 
   // If the operand of the truncate is not an instruction, we will not have
@@ -1896,7 +2330,9 @@
 
 Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt(
     llvm::Instruction *SExt, TypePromotionTransaction &TPT,
-    InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts) {
+    InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts,
+    SmallVectorImpl<Instruction *> *Exts,
+    SmallVectorImpl<Instruction *> *Truncs) {
   // By construction, the operand of SExt is an instruction. Otherwise we cannot
   // get through it and this method should not be called.
   Instruction *SExtOpnd = cast<Instruction>(SExt->getOperand(0));
@@ -1922,8 +2358,11 @@
 
   // Check if the extension is still needed.
   Instruction *ExtInst = dyn_cast<Instruction>(ExtVal);
-  if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType())
+  if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType()) {
+    if (ExtInst && Exts)
+      Exts->push_back(ExtInst);
     return ExtVal;
+  }
 
   // At this point we have: ext ty opnd to ty.
   // Reassign the uses of ExtInst to the opnd and remove ExtInst.
@@ -1934,7 +2373,9 @@
 
 Value *TypePromotionHelper::promoteOperandForOther(
     Instruction *Ext, TypePromotionTransaction &TPT,
-    InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts, bool IsSExt) {
+    InstrToOrigTy &PromotedInsts, unsigned &CreatedInsts,
+    SmallVectorImpl<Instruction *> *Exts,
+    SmallVectorImpl<Instruction *> *Truncs, bool IsSExt) {
   // By construction, the operand of Ext is an instruction. Otherwise we cannot
   // get through it and this method should not be called.
   Instruction *ExtOpnd = cast<Instruction>(Ext->getOperand(0));
@@ -1949,6 +2390,8 @@
       ITrunc->removeFromParent();
       // Insert it just after the definition.
       ITrunc->insertAfter(ExtOpnd);
+      if (Truncs)
+        Truncs->push_back(ITrunc);
     }
 
     TPT.replaceAllUsesWith(ExtOpnd, Trunc);
@@ -2004,12 +2447,17 @@
     if (!ExtForOpnd) {
       // If yes, create a new one.
       DEBUG(dbgs() << "More operands to ext\n");
-      ExtForOpnd =
-          cast<Instruction>(IsSExt ? TPT.createSExt(Ext, Opnd, Ext->getType())
-                                   : TPT.createZExt(Ext, Opnd, Ext->getType()));
+      Value *ValForExtOpnd = IsSExt ? TPT.createSExt(Ext, Opnd, Ext->getType())
+        : TPT.createZExt(Ext, Opnd, Ext->getType());
+      if (!isa<Instruction>(ValForExtOpnd)) {
+        TPT.setOperand(ExtOpnd, OpIdx, ValForExtOpnd);
+        continue;
+      }
+      ExtForOpnd = cast<Instruction>(ValForExtOpnd);
       ++CreatedInsts;
     }
-
+    if (Exts)
+      Exts->push_back(ExtForOpnd);
     TPT.setOperand(ExtForOpnd, 0, Opnd);
 
     // Move the sign extension before the insertion point.
@@ -2047,16 +2495,7 @@
   // The promotion is neutral but it may help folding the sign extension in
   // loads for instance.
   // Check that we did not create an illegal instruction.
-  Instruction *PromotedInst = dyn_cast<Instruction>(PromotedOperand);
-  if (!PromotedInst)
-    return false;
-  int ISDOpcode = TLI.InstructionOpcodeToISD(PromotedInst->getOpcode());
-  // If the ISDOpcode is undefined, it was undefined before the promotion.
-  if (!ISDOpcode)
-    return true;
-  // Otherwise, check if the promoted instruction is legal or not.
-  return TLI.isOperationLegalOrCustom(
-      ISDOpcode, TLI.getValueType(PromotedInst->getType()));
+  return isPromotedInstructionLegal(TLI, PromotedOperand);
 }
 
 /// MatchOperationAddr - Given an instruction or constant expr, see if we can
@@ -2250,7 +2689,8 @@
     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
         TPT.getRestorationPoint();
     unsigned CreatedInsts = 0;
-    Value *PromotedOperand = TPH(Ext, TPT, PromotedInsts, CreatedInsts);
+    Value *PromotedOperand =
+        TPH(Ext, TPT, PromotedInsts, CreatedInsts, nullptr, nullptr);
     // SExt has been moved away.
     // Thus either it will be rematched later in the recursive calls or it is
     // gone. Anyway, we must not fold it into the addressing mode at this point.
@@ -2374,13 +2814,17 @@
 /// inline asm call are due to memory operands.  If so, return true, otherwise
 /// return false.
 static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
-                                    const TargetLowering &TLI) {
-  TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(ImmutableCallSite(CI));
+                                    const TargetMachine &TM) {
+  const Function *F = CI->getParent()->getParent();
+  const TargetLowering *TLI = TM.getSubtargetImpl(*F)->getTargetLowering();
+  const TargetRegisterInfo *TRI = TM.getSubtargetImpl(*F)->getRegisterInfo();
+  TargetLowering::AsmOperandInfoVector TargetConstraints =
+      TLI->ParseConstraints(TRI, ImmutableCallSite(CI));
   for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
     TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
 
     // Compute the constraint code and ConstraintType to use.
-    TLI.ComputeConstraintToUse(OpInfo, SDValue());
+    TLI->ComputeConstraintToUse(OpInfo, SDValue());
 
     // If this asm operand is our Value*, and if it isn't an indirect memory
     // operand, we can't fold it!
@@ -2396,10 +2840,10 @@
 /// FindAllMemoryUses - Recursively walk all the uses of I until we find a
 /// memory use.  If we find an obviously non-foldable instruction, return true.
 /// Add the ultimately found memory instructions to MemoryUses.
-static bool FindAllMemoryUses(Instruction *I,
-                SmallVectorImpl<std::pair<Instruction*,unsigned> > &MemoryUses,
-                              SmallPtrSetImpl<Instruction*> &ConsideredInsts,
-                              const TargetLowering &TLI) {
+static bool FindAllMemoryUses(
+    Instruction *I,
+    SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses,
+    SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetMachine &TM) {
   // If we already considered this instruction, we're done.
   if (!ConsideredInsts.insert(I).second)
     return false;
@@ -2429,12 +2873,12 @@
       if (!IA) return true;
 
       // If this is a memory operand, we're cool, otherwise bail out.
-      if (!IsOperandAMemoryOperand(CI, IA, I, TLI))
+      if (!IsOperandAMemoryOperand(CI, IA, I, TM))
         return true;
       continue;
     }
 
-    if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI))
+    if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TM))
       return true;
   }
 
@@ -2522,7 +2966,7 @@
   // uses.
   SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
   SmallPtrSet<Instruction*, 16> ConsideredInsts;
-  if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI))
+  if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TM))
     return false;  // Has a non-memory, non-foldable use!
 
   // Now that we know that all uses of this instruction are part of a chain of
@@ -2547,7 +2991,7 @@
     ExtAddrMode Result;
     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
         TPT.getRestorationPoint();
-    AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, AddressAccessTy,
+    AddressingModeMatcher Matcher(MatchedAddrModeInsts, TM, AddressAccessTy,
                                   MemoryInst, Result, InsertedTruncs,
                                   PromotedInsts, TPT);
     Matcher.IgnoreProfitability = true;
@@ -2630,7 +3074,7 @@
     // For non-PHIs, determine the addressing mode being computed.
     SmallVector<Instruction*, 16> NewAddrModeInsts;
     ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
-        V, AccessTy, MemoryInst, NewAddrModeInsts, *TLI, InsertedTruncsSet,
+        V, AccessTy, MemoryInst, NewAddrModeInsts, *TM, InsertedTruncsSet,
         PromotedInsts, TPT);
 
     // This check is broken into two cases with very similar code to avoid using
@@ -2705,8 +3149,10 @@
                  << *MemoryInst << "\n");
     if (SunkAddr->getType() != Addr->getType())
       SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType());
-  } else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() &&
-               TM && TM->getSubtarget<TargetSubtargetInfo>().useAA())) {
+  } else if (AddrSinkUsingGEPs ||
+             (!AddrSinkUsingGEPs.getNumOccurrences() && TM &&
+              TM->getSubtargetImpl(*MemoryInst->getParent()->getParent())
+                  ->useAA())) {
     // By default, we use the GEP-based method when AA is used later. This
     // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.
     DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
@@ -2929,8 +3375,10 @@
 bool CodeGenPrepare::OptimizeInlineAsmInst(CallInst *CS) {
   bool MadeChange = false;
 
+  const TargetRegisterInfo *TRI =
+      TM->getSubtargetImpl(*CS->getParent()->getParent())->getRegisterInfo();
   TargetLowering::AsmOperandInfoVector
-    TargetConstraints = TLI->ParseConstraints(CS);
+    TargetConstraints = TLI->ParseConstraints(TRI, CS);
   unsigned ArgNo = 0;
   for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
     TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
@@ -2949,26 +3397,186 @@
   return MadeChange;
 }
 
+/// \brief Check if all the uses of \p Inst are equivalent (or free) zero or
+/// sign extensions.
+static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) {
+  assert(!Inst->use_empty() && "Input must have at least one use");
+  const Instruction *FirstUser = cast<Instruction>(*Inst->user_begin());
+  bool IsSExt = isa<SExtInst>(FirstUser);
+  Type *ExtTy = FirstUser->getType();
+  for (const User *U : Inst->users()) {
+    const Instruction *UI = cast<Instruction>(U);
+    if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI)))
+      return false;
+    Type *CurTy = UI->getType();
+    // Same input and output types: Same instruction after CSE.
+    if (CurTy == ExtTy)
+      continue;
+
+    // If IsSExt is true, we are in this situation:
+    // a = Inst
+    // b = sext ty1 a to ty2
+    // c = sext ty1 a to ty3
+    // Assuming ty2 is shorter than ty3, this could be turned into:
+    // a = Inst
+    // b = sext ty1 a to ty2
+    // c = sext ty2 b to ty3
+    // However, the last sext is not free.
+    if (IsSExt)
+      return false;
+
+    // This is a ZExt, maybe this is free to extend from one type to another.
+    // In that case, we would not account for a different use.
+    Type *NarrowTy;
+    Type *LargeTy;
+    if (ExtTy->getScalarType()->getIntegerBitWidth() >
+        CurTy->getScalarType()->getIntegerBitWidth()) {
+      NarrowTy = CurTy;
+      LargeTy = ExtTy;
+    } else {
+      NarrowTy = ExtTy;
+      LargeTy = CurTy;
+    }
+
+    if (!TLI.isZExtFree(NarrowTy, LargeTy))
+      return false;
+  }
+  // All uses are the same or can be derived from one another for free.
+  return true;
+}
+
+/// \brief Try to form ExtLd by promoting \p Exts until they reach a
+/// load instruction.
+/// If an ext(load) can be formed, it is returned via \p LI for the load
+/// and \p Inst for the extension.
+/// Otherwise LI == nullptr and Inst == nullptr.
+/// When some promotion happened, \p TPT contains the proper state to
+/// revert them.
+///
+/// \return true when promoting was necessary to expose the ext(load)
+/// opportunity, false otherwise.
+///
+/// Example:
+/// \code
+/// %ld = load i32* %addr
+/// %add = add nuw i32 %ld, 4
+/// %zext = zext i32 %add to i64
+/// \endcode
+/// =>
+/// \code
+/// %ld = load i32* %addr
+/// %zext = zext i32 %ld to i64
+/// %add = add nuw i64 %zext, 4
+/// \endcode
+/// Thanks to the promotion, we can match zext(load i32*) to i64.
+bool CodeGenPrepare::ExtLdPromotion(TypePromotionTransaction &TPT,
+                                    LoadInst *&LI, Instruction *&Inst,
+                                    const SmallVectorImpl<Instruction *> &Exts,
+                                    unsigned CreatedInsts = 0) {
+  // Iterate over all the extensions to see if one forms an ext(load).
+  for (auto I : Exts) {
+    // Check if we directly have ext(load).
+    if ((LI = dyn_cast<LoadInst>(I->getOperand(0)))) {
+      Inst = I;
+      // No promotion happened here.
+      return false;
+    }
+    // Check whether or not we want to do any promotion.
+    if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion)
+      continue;
+    // Get the action to perform the promotion.
+    TypePromotionHelper::Action TPH = TypePromotionHelper::getAction(
+        I, InsertedTruncsSet, *TLI, PromotedInsts);
+    // Check if we can promote.
+    if (!TPH)
+      continue;
+    // Save the current state.
+    TypePromotionTransaction::ConstRestorationPt LastKnownGood =
+        TPT.getRestorationPoint();
+    SmallVector<Instruction *, 4> NewExts;
+    unsigned NewCreatedInsts = 0;
+    // Promote.
+    Value *PromotedVal =
+        TPH(I, TPT, PromotedInsts, NewCreatedInsts, &NewExts, nullptr);
+    assert(PromotedVal &&
+           "TypePromotionHelper should have filtered out those cases");
+
+    // Only one extension can be merged into a load.
+    // Therefore, if we have more than 1 new extension we heuristically
+    // cut this search path, because it means we degrade the code quality.
+    // With exactly 2, the transformation is neutral, because we will merge
+    // one extension but leave one. However, we optimistically keep going,
+    // because the new extension may be removed too.
+    unsigned TotalCreatedInsts = CreatedInsts + NewCreatedInsts;
+    if (!StressExtLdPromotion &&
+        (TotalCreatedInsts > 1 ||
+         !isPromotedInstructionLegal(*TLI, PromotedVal))) {
+      // The promotion is not profitable, rollback to the previous state.
+      TPT.rollback(LastKnownGood);
+      continue;
+    }
+    // The promotion is profitable.
+    // Check if it exposes an ext(load).
+    (void)ExtLdPromotion(TPT, LI, Inst, NewExts, TotalCreatedInsts);
+    if (LI && (StressExtLdPromotion || NewCreatedInsts == 0 ||
+               // If we have created a new extension, i.e., now we have two
+               // extensions. We must make sure one of them is merged with
+               // the load, otherwise we may degrade the code quality.
+               (LI->hasOneUse() || hasSameExtUse(LI, *TLI))))
+      // Promotion happened.
+      return true;
+    // If this does not help to expose an ext(load) then, rollback.
+    TPT.rollback(LastKnownGood);
+  }
+  // None of the extensions can form an ext(load).
+  LI = nullptr;
+  Inst = nullptr;
+  return false;
+}
+
 /// MoveExtToFormExtLoad - Move a zext or sext fed by a load into the same
 /// basic block as the load, unless conditions are unfavorable. This allows
 /// SelectionDAG to fold the extend into the load.
+/// \p I[in/out] the extension may be modified during the process if some
+/// promotions apply.
 ///
-bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *I) {
+bool CodeGenPrepare::MoveExtToFormExtLoad(Instruction *&I) {
+  // Try to promote a chain of computation if it allows forming an
+  // extended load.
+  TypePromotionTransaction TPT;
+  TypePromotionTransaction::ConstRestorationPt LastKnownGood =
+    TPT.getRestorationPoint();
+  SmallVector<Instruction *, 1> Exts;
+  Exts.push_back(I);
   // Look for a load being extended.
-  LoadInst *LI = dyn_cast<LoadInst>(I->getOperand(0));
-  if (!LI) return false;
+  LoadInst *LI = nullptr;
+  Instruction *OldExt = I;
+  bool HasPromoted = ExtLdPromotion(TPT, LI, I, Exts);
+  if (!LI || !I) {
+    assert(!HasPromoted && !LI && "If we did not match any load instruction "
+                                  "the code must remain the same");
+    I = OldExt;
+    return false;
+  }
 
   // If they're already in the same block, there's nothing to do.
-  if (LI->getParent() == I->getParent())
+  // Make the cheap checks first if we did not promote.
+  // If we promoted, we need to check if it is indeed profitable.
+  if (!HasPromoted && LI->getParent() == I->getParent())
     return false;
 
+  EVT VT = TLI->getValueType(I->getType());
+  EVT LoadVT = TLI->getValueType(LI->getType());
+
   // If the load has other users and the truncate is not free, this probably
   // isn't worthwhile.
-  if (!LI->hasOneUse() &&
-      TLI && (TLI->isTypeLegal(TLI->getValueType(LI->getType())) ||
-              !TLI->isTypeLegal(TLI->getValueType(I->getType()))) &&
-      !TLI->isTruncateFree(I->getType(), LI->getType()))
+  if (!LI->hasOneUse() && TLI &&
+      (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) &&
+      !TLI->isTruncateFree(I->getType(), LI->getType())) {
+    I = OldExt;
+    TPT.rollback(LastKnownGood);
     return false;
+  }
 
   // Check whether the target supports casts folded into loads.
   unsigned LType;
@@ -2978,11 +3586,15 @@
     assert(isa<SExtInst>(I) && "Unexpected ext type!");
     LType = ISD::SEXTLOAD;
   }
-  if (TLI && !TLI->isLoadExtLegal(LType, TLI->getValueType(LI->getType())))
+  if (TLI && !TLI->isLoadExtLegal(LType, VT, LoadVT)) {
+    I = OldExt;
+    TPT.rollback(LastKnownGood);
     return false;
+  }
 
   // Move the extend into the same block as the load, so that SelectionDAG
   // can fold it.
+  TPT.commit();
   I->removeFromParent();
   I->insertAfter(LI);
   ++NumExtsMoved;
@@ -3512,7 +4124,8 @@
           isa<UndefValue>(Val) ||
               canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()));
     } else
-      assert(0 && "Did you modified shouldPromote and forgot to update this?");
+      llvm_unreachable("Did you modified shouldPromote and forgot to update "
+                       "this?");
     ToBePromoted->setOperand(U.getOperandNo(), NewVal);
   }
   Transition->removeFromParent();
@@ -3575,7 +4188,7 @@
   return false;
 }
 
-bool CodeGenPrepare::OptimizeInst(Instruction *I) {
+bool CodeGenPrepare::OptimizeInst(Instruction *I, bool& ModifiedDT) {
   if (PHINode *P = dyn_cast<PHINode>(I)) {
     // It is possible for very late stage optimizations (such as SimplifyCFG)
     // to introduce PHI nodes too late to be cleaned up.  If we detect such a
@@ -3654,14 +4267,14 @@
       GEPI->replaceAllUsesWith(NC);
       GEPI->eraseFromParent();
       ++NumGEPsElim;
-      OptimizeInst(NC);
+      OptimizeInst(NC, ModifiedDT);
       return true;
     }
     return false;
   }
 
   if (CallInst *CI = dyn_cast<CallInst>(I))
-    return OptimizeCallInst(CI);
+    return OptimizeCallInst(CI, ModifiedDT);
 
   if (SelectInst *SI = dyn_cast<SelectInst>(I))
     return OptimizeSelectInst(SI);
@@ -3678,14 +4291,16 @@
 // In this pass we look for GEP and cast instructions that are used
 // across basic blocks and rewrite them to improve basic-block-at-a-time
 // selection.
-bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) {
+bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
   SunkAddrs.clear();
   bool MadeChange = false;
 
   CurInstIterator = BB.begin();
-  while (CurInstIterator != BB.end())
-    MadeChange |= OptimizeInst(CurInstIterator++);
-
+  while (CurInstIterator != BB.end()) {
+    MadeChange |= OptimizeInst(CurInstIterator++, ModifiedDT);
+    if (ModifiedDT)
+      return true;
+  }
   MadeChange |= DupRetToEnableTailCallOpts(&BB);
 
   return MadeChange;
@@ -3696,10 +4311,10 @@
 // find a node corresponding to the value.
 bool CodeGenPrepare::PlaceDbgValues(Function &F) {
   bool MadeChange = false;
-  for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+  for (BasicBlock &BB : F) {
     Instruction *PrevNonDbgInst = nullptr;
-    for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE;) {
-      Instruction *Insn = BI; ++BI;
+    for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
+      Instruction *Insn = BI++;
       DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn);
       // Leave dbg.values that refer to an alloca alone. These
       // instrinsics describe the address of a variable (= the alloca)
@@ -3793,3 +4408,233 @@
   }
   return MadeChange;
 }
+
+/// \brief Retrieve the probabilities of a conditional branch. Returns true on
+/// success, or returns false if no or invalid metadata was found.
+static bool extractBranchMetadata(BranchInst *BI,
+                                  uint64_t &ProbTrue, uint64_t &ProbFalse) {
+  assert(BI->isConditional() &&
+         "Looking for probabilities on unconditional branch?");
+  auto *ProfileData = BI->getMetadata(LLVMContext::MD_prof);
+  if (!ProfileData || ProfileData->getNumOperands() != 3)
+    return false;
+
+  const auto *CITrue =
+      mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(1));
+  const auto *CIFalse =
+      mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(2));
+  if (!CITrue || !CIFalse)
+    return false;
+
+  ProbTrue = CITrue->getValue().getZExtValue();
+  ProbFalse = CIFalse->getValue().getZExtValue();
+
+  return true;
+}
+
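For context, the metadata this helper reads carries the "branch_weights" string as operand 0 and the two counts as operands 1 and 2. An illustrative snippet (Br is assumed to be some conditional BranchInst; the MDBuilder call is the same one used further below):

  Br->setMetadata(LLVMContext::MD_prof,
                  MDBuilder(Br->getContext()).createBranchWeights(64, 4));

With that annotation, extractBranchMetadata reports ProbTrue = 64 and ProbFalse = 4.
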
+/// \brief Scale down both weights to fit into uint32_t.
+static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
+  uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
+  uint32_t Scale = (NewMax / UINT32_MAX) + 1;
+  NewTrue = NewTrue / Scale;
+  NewFalse = NewFalse / Scale;
+}
+
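A quick standalone check of the scaling rule on example counts that overflow uint32_t (illustration only, assuming 2^33 taken and 2^31 not-taken counts):

  #include <algorithm>
  #include <cassert>
  #include <cstdint>

  int main() {
    uint64_t NewTrue = 1ULL << 33, NewFalse = 1ULL << 31;
    uint32_t Scale = (std::max(NewTrue, NewFalse) / UINT32_MAX) + 1; // 3 here
    NewTrue /= Scale;
    NewFalse /= Scale;
    // Both values now fit in 32 bits and keep roughly the original 4:1 ratio.
    assert(NewTrue <= UINT32_MAX && NewFalse <= UINT32_MAX);
    return 0;
  }
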
+/// \brief Some targets prefer to split a conditional branch like:
+/// \code
+///   %0 = icmp ne i32 %a, 0
+///   %1 = icmp ne i32 %b, 0
+///   %or.cond = or i1 %0, %1
+///   br i1 %or.cond, label %TrueBB, label %FalseBB
+/// \endcode
+/// into multiple branch instructions like:
+/// \code
+///   bb1:
+///     %0 = icmp ne i32 %a, 0
+///     br i1 %0, label %TrueBB, label %bb2
+///   bb2:
+///     %1 = icmp ne i32 %b, 0
+///     br i1 %1, label %TrueBB, label %FalseBB
+/// \endcode
+/// This usually allows instruction selection to do even further optimizations
+/// and combine the compare with the branch instruction. Currently this is
+/// applied for targets which have "cheap" jump instructions.
+///
+/// FIXME: Remove the (equivalent?) implementation in SelectionDAG.
+///
+bool CodeGenPrepare::splitBranchCondition(Function &F) {
+  if (!TM || TM->Options.EnableFastISel != true ||
+      !TLI || TLI->isJumpExpensive())
+    return false;
+
+  bool MadeChange = false;
+  for (auto &BB : F) {
+    // Does this BB end with the following?
+    //   %cond1 = icmp|fcmp|binary instruction ...
+    //   %cond2 = icmp|fcmp|binary instruction ...
+    //   %cond.or = or|and i1 %cond1, %cond2
+    //   br i1 %cond.or, label %dest1, label %dest2
+    BinaryOperator *LogicOp;
+    BasicBlock *TBB, *FBB;
+    if (!match(BB.getTerminator(), m_Br(m_OneUse(m_BinOp(LogicOp)), TBB, FBB)))
+      continue;
+
+    unsigned Opc;
+    Value *Cond1, *Cond2;
+    if (match(LogicOp, m_And(m_OneUse(m_Value(Cond1)),
+                             m_OneUse(m_Value(Cond2)))))
+      Opc = Instruction::And;
+    else if (match(LogicOp, m_Or(m_OneUse(m_Value(Cond1)),
+                                 m_OneUse(m_Value(Cond2)))))
+      Opc = Instruction::Or;
+    else
+      continue;
+
+    if (!match(Cond1, m_CombineOr(m_Cmp(), m_BinOp())) ||
+        !match(Cond2, m_CombineOr(m_Cmp(), m_BinOp())))
+      continue;
+
+    DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump());
+
+    // Create a new BB.
+    auto *InsertBefore = std::next(Function::iterator(BB))
+        .getNodePtrUnchecked();
+    auto TmpBB = BasicBlock::Create(BB.getContext(),
+                                    BB.getName() + ".cond.split",
+                                    BB.getParent(), InsertBefore);
+
+    // Update the original basic block to use the first condition directly in
+    // the branch instruction and remove the now-dead and/or instruction.
+    auto *Br1 = cast<BranchInst>(BB.getTerminator());
+    Br1->setCondition(Cond1);
+    LogicOp->eraseFromParent();
+
+    // Depending on the condition we have to replace either the true or the
+    // false successor of the original branch instruction.
+    if (Opc == Instruction::And)
+      Br1->setSuccessor(0, TmpBB);
+    else
+      Br1->setSuccessor(1, TmpBB);
+
+    // Fill in the new basic block.
+    auto *Br2 = IRBuilder<>(TmpBB).CreateCondBr(Cond2, TBB, FBB);
+    if (auto *I = dyn_cast<Instruction>(Cond2)) {
+      I->removeFromParent();
+      I->insertBefore(Br2);
+    }
+
+    // Update PHI nodes in both successors. The original BB needs to be
+    // replaced in one successor's PHI nodes, because the branch now comes
+    // from the newly generated BB (TmpBB). In the other successor we need to
+    // add one incoming edge to the PHI nodes, because both branch
+    // instructions now target the same successor. Depending on the original
+    // branch condition (and/or) we have to swap the successors (TrueDest,
+    // FalseDest), so that we perform the correct update for the PHI nodes.
+    // This doesn't change the successor order of the just created branch
+    // instruction (or any other instruction).
+    if (Opc == Instruction::Or)
+      std::swap(TBB, FBB);
+
+    // Replace the old BB with the new BB.
+    for (auto &I : *TBB) {
+      PHINode *PN = dyn_cast<PHINode>(&I);
+      if (!PN)
+        break;
+      int i;
+      while ((i = PN->getBasicBlockIndex(&BB)) >= 0)
+        PN->setIncomingBlock(i, TmpBB);
+    }
+
+    // Add another incoming edge from the new BB.
+    for (auto &I : *FBB) {
+      PHINode *PN = dyn_cast<PHINode>(&I);
+      if (!PN)
+        break;
+      auto *Val = PN->getIncomingValueForBlock(&BB);
+      PN->addIncoming(Val, TmpBB);
+    }
+
+    // Update the branch weights (from SelectionDAGBuilder::
+    // FindMergedConditions).
+    if (Opc == Instruction::Or) {
+      // Codegen X | Y as:
+      // BB1:
+      //   jmp_if_X TBB
+      //   jmp TmpBB
+      // TmpBB:
+      //   jmp_if_Y TBB
+      //   jmp FBB
+      //
+
+      // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
+      // The requirement is that
+      //   TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB)
+      //     = TrueProb for original BB.
+      // Assuming the original weights are A and B, one choice is to set BB1's
+      // weights to A and A+2B, and set TmpBB's weights to A and 2B. This choice
+      // assumes that
+      //   TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB.
+      // Another choice is to assume TrueProb for BB1 equals to TrueProb for
+      // TmpBB, but the math is more complicated.
+      uint64_t TrueWeight, FalseWeight;
+      if (extractBranchMetadata(Br1, TrueWeight, FalseWeight)) {
+        uint64_t NewTrueWeight = TrueWeight;
+        uint64_t NewFalseWeight = TrueWeight + 2 * FalseWeight;
+        scaleWeights(NewTrueWeight, NewFalseWeight);
+        Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext())
+                         .createBranchWeights(NewTrueWeight, NewFalseWeight));
+
+        NewTrueWeight = TrueWeight;
+        NewFalseWeight = 2 * FalseWeight;
+        scaleWeights(NewTrueWeight, NewFalseWeight);
+        Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext())
+                         .createBranchWeights(NewTrueWeight, NewFalseWeight));
+      }
+    } else {
+      // Codegen X & Y as:
+      // BB1:
+      //   jmp_if_X TmpBB
+      //   jmp FBB
+      // TmpBB:
+      //   jmp_if_Y TBB
+      //   jmp FBB
+      //
+      //  This requires creation of TmpBB after CurBB.
+
+      // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
+      // The requirement is that
+      //   FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB)
+      //     = FalseProb for original BB.
+      // Assuming the original weights are A and B, one choice is to set BB1's
+      // weights to 2A+B and B, and set TmpBB's weights to 2A and B. This choice
+      // assumes that
+      //   FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB.
+      uint64_t TrueWeight, FalseWeight;
+      if (extractBranchMetadata(Br1, TrueWeight, FalseWeight)) {
+        uint64_t NewTrueWeight = 2 * TrueWeight + FalseWeight;
+        uint64_t NewFalseWeight = FalseWeight;
+        scaleWeights(NewTrueWeight, NewFalseWeight);
+        Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext())
+                         .createBranchWeights(NewTrueWeight, NewFalseWeight));
+
+        NewTrueWeight = 2 * TrueWeight;
+        NewFalseWeight = FalseWeight;
+        scaleWeights(NewTrueWeight, NewFalseWeight);
+        Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext())
+                         .createBranchWeights(NewTrueWeight, NewFalseWeight));
+      }
+    }
+
+    // Request DOM Tree update.
+    // Note: No point in getting fancy here, since the DT info is never
+    // available to CodeGenPrepare and the existing update code is broken
+    // anyway.
+    ModifiedDT = true;
+
+    MadeChange = true;
+
+    DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump();
+          TmpBB->dump());
+  }
+  return MadeChange;
+}
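
To make the Or-case weight bookkeeping in splitBranchCondition concrete, a small numeric check (illustration only, assuming original true/false weights A = 64 and B = 4) that the chosen split preserves the probability of reaching TBB:

  #include <cassert>
  #include <cmath>

  int main() {
    double A = 64, B = 4;                    // original weights of BB
    double TrueOrig = A / (A + B);           // P(BB branches to TBB)
    double TrueBB1 = A / (A + (A + 2 * B));  // BB1 weights: A vs. A + 2B
    double FalseBB1 = 1.0 - TrueBB1;
    double TrueTmpBB = A / (A + 2 * B);      // TmpBB weights: A vs. 2B
    // TrueProb(BB1) + FalseProb(BB1) * TrueProb(TmpBB) == TrueProb(BB)
    assert(std::fabs(TrueBB1 + FalseBB1 * TrueTmpBB - TrueOrig) < 1e-12);
    return 0;
  }
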
diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp
index 48213c1..c17a35d 100644
--- a/lib/CodeGen/DeadMachineInstructionElim.cpp
+++ b/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -59,6 +59,10 @@
   if (MI->isInlineAsm())
     return false;
 
+  // Don't delete frame allocation labels.
+  if (MI->getOpcode() == TargetOpcode::FRAME_ALLOC)
+    return false;
+
   // Don't delete instructions with side effects.
   bool SawStore = false;
   if (!MI->isSafeToMove(TII, nullptr, SawStore) && !MI->isPHI())
diff --git a/lib/CodeGen/DwarfEHPrepare.cpp b/lib/CodeGen/DwarfEHPrepare.cpp
index 75b74d9..7b47a48 100644
--- a/lib/CodeGen/DwarfEHPrepare.cpp
+++ b/lib/CodeGen/DwarfEHPrepare.cpp
@@ -14,18 +14,12 @@
 
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/IR/CallSite.h"
-#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
-#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
 using namespace llvm;
 
 #define DEBUG_TYPE "dwarfehprepare"
@@ -44,10 +38,13 @@
 
   public:
     static char ID; // Pass identification, replacement for typeid.
+
+    // INITIALIZE_TM_PASS requires a default constructor, but it isn't used in
+    // practice.
+    DwarfEHPrepare() : FunctionPass(ID), TM(nullptr), RewindFunction(nullptr) {}
+
     DwarfEHPrepare(const TargetMachine *TM)
-        : FunctionPass(ID), TM(TM), RewindFunction(nullptr) {
-      initializeDominatorTreeWrapperPassPass(*PassRegistry::getPassRegistry());
-    }
+        : FunctionPass(ID), TM(TM), RewindFunction(nullptr) {}
 
     bool runOnFunction(Function &Fn) override;
 
@@ -56,8 +53,6 @@
       return false;
     }
 
-    void getAnalysisUsage(AnalysisUsage &AU) const override { }
-
     const char *getPassName() const override {
       return "Exception handling preparation";
     }
@@ -65,6 +60,8 @@
 } // end anonymous namespace
 
 char DwarfEHPrepare::ID = 0;
+INITIALIZE_TM_PASS(DwarfEHPrepare, "dwarfehprepare", "Prepare DWARF exceptions",
+                   false, false)
 
 FunctionPass *llvm::createDwarfEHPass(const TargetMachine *TM) {
   return new DwarfEHPrepare(TM);
@@ -99,11 +96,11 @@
   RI->eraseFromParent();
 
   if (EraseIVIs) {
-    if (SelIVI->getNumUses() == 0)
+    if (SelIVI->use_empty())
       SelIVI->eraseFromParent();
-    if (ExcIVI->getNumUses() == 0)
+    if (ExcIVI->use_empty())
       ExcIVI->eraseFromParent();
-    if (SelLoad && SelLoad->getNumUses() == 0)
+    if (SelLoad && SelLoad->use_empty())
       SelLoad->eraseFromParent();
   }
 
@@ -114,9 +111,8 @@
 /// into calls to the appropriate _Unwind_Resume function.
 bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) {
   SmallVector<ResumeInst*, 16> Resumes;
-  for (Function::iterator I = Fn.begin(), E = Fn.end(); I != E; ++I) {
-    TerminatorInst *TI = I->getTerminator();
-    if (ResumeInst *RI = dyn_cast<ResumeInst>(TI))
+  for (BasicBlock &BB : Fn) {
+    if (auto *RI = dyn_cast<ResumeInst>(BB.getTerminator()))
       Resumes.push_back(RI);
   }
 
@@ -124,9 +120,9 @@
     return false;
 
   // Find the rewind function if we didn't already.
-  const TargetLowering *TLI = TM->getSubtargetImpl()->getTargetLowering();
+  const TargetLowering *TLI = TM->getSubtargetImpl(Fn)->getTargetLowering();
+  LLVMContext &Ctx = Fn.getContext();
   if (!RewindFunction) {
-    LLVMContext &Ctx = Resumes[0]->getContext();
     FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx),
                                           Type::getInt8PtrTy(Ctx), false);
     const char *RewindName = TLI->getLibcallName(RTLIB::UNWIND_RESUME);
@@ -134,7 +130,6 @@
   }
 
   // Create the basic block where the _Unwind_Resume call will live.
-  LLVMContext &Ctx = Fn.getContext();
   unsigned ResumesSize = Resumes.size();
 
   if (ResumesSize == 1) {
@@ -159,9 +154,7 @@
 
   // Extract the exception object from the ResumeInst and add it to the PHI node
   // that feeds the _Unwind_Resume call.
-  for (SmallVectorImpl<ResumeInst*>::iterator
-         I = Resumes.begin(), E = Resumes.end(); I != E; ++I) {
-    ResumeInst *RI = *I;
+  for (ResumeInst *RI : Resumes) {
     BasicBlock *Parent = RI->getParent();
     BranchInst::Create(UnwindBB, Parent);
 
@@ -181,6 +174,7 @@
 }
 
 bool DwarfEHPrepare::runOnFunction(Function &Fn) {
+  assert(TM && "DWARF EH preparation requires a target machine");
   bool Changed = InsertUnwindResumeCalls(Fn);
   return Changed;
 }
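
The getNumUses() to use_empty() change above follows the usual LLVM guidance: counting uses walks the entire use list, while an emptiness check only inspects the head. A rough standalone analogue over a hand-rolled use list (the Use and Value types below are invented for illustration, not the LLVM classes):

#include <cstddef>
#include <iostream>

struct Use { Use *Next = nullptr; };

struct Value {
  Use *UseListHead = nullptr;

  // Counting every use is O(number of uses)...
  std::size_t getNumUses() const {
    std::size_t N = 0;
    for (Use *U = UseListHead; U; U = U->Next)
      ++N;
    return N;
  }
  // ...whereas an emptiness check only looks at the head pointer.
  bool use_empty() const { return UseListHead == nullptr; }
};

int main() {
  Value V;
  Use U1, U2;
  U1.Next = &U2;
  V.UseListHead = &U1;
  std::cout << V.getNumUses() << " uses, empty? " << V.use_empty() << "\n";
  return 0;
}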
diff --git a/lib/CodeGen/EarlyIfConversion.cpp b/lib/CodeGen/EarlyIfConversion.cpp
index 995606f..8f74271 100644
--- a/lib/CodeGen/EarlyIfConversion.cpp
+++ b/lib/CodeGen/EarlyIfConversion.cpp
@@ -777,15 +777,13 @@
   DEBUG(dbgs() << "********** EARLY IF-CONVERSION **********\n"
                << "********** Function: " << MF.getName() << '\n');
   // Only run if conversion if the target wants it.
-  if (!MF.getTarget()
-           .getSubtarget<TargetSubtargetInfo>()
-           .enableEarlyIfConversion())
+  const TargetSubtargetInfo &STI = MF.getSubtarget();
+  if (!STI.enableEarlyIfConversion())
     return false;
 
-  TII = MF.getSubtarget().getInstrInfo();
-  TRI = MF.getSubtarget().getRegisterInfo();
-  SchedModel =
-    MF.getTarget().getSubtarget<TargetSubtargetInfo>().getSchedModel();
+  TII = STI.getInstrInfo();
+  TRI = STI.getRegisterInfo();
+  SchedModel = STI.getSchedModel();
   MRI = &MF.getRegInfo();
   DomTree = &getAnalysis<MachineDominatorTree>();
   Loops = getAnalysisIfAvailable<MachineLoopInfo>();
diff --git a/lib/CodeGen/ErlangGC.cpp b/lib/CodeGen/ErlangGC.cpp
index 85b0893..024946d 100644
--- a/lib/CodeGen/ErlangGC.cpp
+++ b/lib/CodeGen/ErlangGC.cpp
@@ -27,56 +27,20 @@
 
 namespace {
 
-  class ErlangGC : public GCStrategy {
-    MCSymbol *InsertLabel(MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator MI,
-                          DebugLoc DL) const;
-  public:
-    ErlangGC();
-    bool findCustomSafePoints(GCFunctionInfo &FI, MachineFunction &MF) override;
-  };
-
+class ErlangGC : public GCStrategy {
+public:
+  ErlangGC();
+};
 }
 
-static GCRegistry::Add<ErlangGC>
-X("erlang", "erlang-compatible garbage collector");
+static GCRegistry::Add<ErlangGC> X("erlang",
+                                   "erlang-compatible garbage collector");
 
-void llvm::linkErlangGC() { }
+void llvm::linkErlangGC() {}
 
 ErlangGC::ErlangGC() {
   InitRoots = false;
   NeededSafePoints = 1 << GC::PostCall;
   UsesMetadata = true;
   CustomRoots = false;
-  CustomSafePoints = true;
-}
-
-MCSymbol *ErlangGC::InsertLabel(MachineBasicBlock &MBB,
-                                MachineBasicBlock::iterator MI,
-                                DebugLoc DL) const {
-  const TargetInstrInfo *TII = MBB.getParent()->getSubtarget().getInstrInfo();
-  MCSymbol *Label = MBB.getParent()->getContext().CreateTempSymbol();
-  BuildMI(MBB, MI, DL, TII->get(TargetOpcode::GC_LABEL)).addSym(Label);
-  return Label;
-}
-
-bool ErlangGC::findCustomSafePoints(GCFunctionInfo &FI, MachineFunction &MF) {
-  for (MachineFunction::iterator BBI = MF.begin(), BBE = MF.end(); BBI != BBE;
-       ++BBI)
-    for (MachineBasicBlock::iterator MI = BBI->begin(), ME = BBI->end();
-         MI != ME; ++MI)
-
-      if (MI->getDesc().isCall()) {
-
-        // Do not treat tail call sites as safe points.
-        if (MI->getDesc().isTerminator())
-          continue;
-
-        /* Code copied from VisitCallPoint(...) */
-        MachineBasicBlock::iterator RAI = MI; ++RAI;
-        MCSymbol* Label = InsertLabel(*MI->getParent(), RAI, MI->getDebugLoc());
-        FI.addSafePoint(GC::PostCall, Label, MI->getDebugLoc());
-      }
-
-  return false;
 }
diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index 3680498..b3a22c8 100644
--- a/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
@@ -22,6 +22,7 @@
 
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -74,6 +75,9 @@
 
   // Is domain available?
   bool hasDomain(unsigned domain) const {
+    assert(domain <
+               static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
+           "undefined behavior");
     return AvailableDomains & (1u << domain);
   }
 
@@ -133,7 +137,7 @@
   MachineFunction *MF;
   const TargetInstrInfo *TII;
   const TargetRegisterInfo *TRI;
-  std::vector<int> AliasMap;
+  std::vector<SmallVector<int, 1>> AliasMap;
   const unsigned NumRegs;
   LiveReg *LiveRegs;
   typedef DenseMap<MachineBasicBlock*, LiveReg*> LiveOutMap;
@@ -169,8 +173,8 @@
   }
 
 private:
-  // Register mapping.
-  int regIndex(unsigned Reg);
+  iterator_range<SmallVectorImpl<int>::const_iterator>
+  regIndizes(unsigned Reg) const;
 
   // DomainValue allocation.
   DomainValue *alloc(int domain = -1);
@@ -201,11 +205,13 @@
 
 char ExeDepsFix::ID = 0;
 
-/// Translate TRI register number to an index into our smaller tables of
-/// interesting registers. Return -1 for boring registers.
-int ExeDepsFix::regIndex(unsigned Reg) {
+/// Translate TRI register number to a list of indices into our smaller tables
+/// of interesting registers.
+iterator_range<SmallVectorImpl<int>::const_iterator>
+ExeDepsFix::regIndizes(unsigned Reg) const {
   assert(Reg < AliasMap.size() && "Invalid register");
-  return AliasMap[Reg];
+  const auto &Entry = AliasMap[Reg];
+  return make_range(Entry.begin(), Entry.end());
 }
 
 DomainValue *ExeDepsFix::alloc(int domain) {
@@ -338,9 +344,11 @@
   // All uses of B are referred to A.
   B->Next = retain(A);
 
-  for (unsigned rx = 0; rx != NumRegs; ++rx)
+  for (unsigned rx = 0; rx != NumRegs; ++rx) {
+    assert(LiveRegs && "no space allocated for live registers");
     if (LiveRegs[rx].Value == B)
       setLiveReg(rx, A);
+  }
   return true;
 }
 
@@ -370,13 +378,12 @@
   if (MBB->pred_empty()) {
     for (MachineBasicBlock::livein_iterator i = MBB->livein_begin(),
          e = MBB->livein_end(); i != e; ++i) {
-      int rx = regIndex(*i);
-      if (rx < 0)
-        continue;
-      // Treat function live-ins as if they were defined just before the first
-      // instruction.  Usually, function arguments are set up immediately
-      // before the call.
-      LiveRegs[rx].Def = -1;
+      for (int rx : regIndizes(*i)) {
+        // Treat function live-ins as if they were defined just before the first
+        // instruction.  Usually, function arguments are set up immediately
+        // before the call.
+        LiveRegs[rx].Def = -1;
+      }
     }
     DEBUG(dbgs() << "BB#" << MBB->getNumber() << ": entry\n");
     return;
@@ -467,26 +474,26 @@
 /// or undef use.
 bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
                                        unsigned Pref) {
-  int rx = regIndex(MI->getOperand(OpIdx).getReg());
-  if (rx < 0)
-    return false;
+  unsigned reg = MI->getOperand(OpIdx).getReg();
+  for (int rx : regIndizes(reg)) {
+    unsigned Clearance = CurInstr - LiveRegs[rx].Def;
+    DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref);
 
-  unsigned Clearance = CurInstr - LiveRegs[rx].Def;
-  DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref);
-
-  if (Pref > Clearance) {
-    DEBUG(dbgs() << ": Break dependency.\n");
-    return true;
-  }
-  // The current clearance seems OK, but we may be ignoring a def from a
-  // back-edge.
-  if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) {
-    DEBUG(dbgs() << ": OK .\n");
+    if (Pref > Clearance) {
+      DEBUG(dbgs() << ": Break dependency.\n");
+      continue;
+    }
+    // The current clearance seems OK, but we may be ignoring a def from a
+    // back-edge.
+    if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) {
+      DEBUG(dbgs() << ": OK .\n");
+      return false;
+    }
+    // A def from an unprocessed back-edge may make us break this dependency.
+    DEBUG(dbgs() << ": Wait for back-edge to resolve.\n");
     return false;
   }
-  // A def from an unprocessed back-edge may make us break this dependency.
-  DEBUG(dbgs() << ": Wait for back-edge to resolve.\n");
-  return false;
+  return true;
 }
 
 // Update def-ages for registers defined by MI.
@@ -514,26 +521,24 @@
       break;
     if (MO.isUse())
       continue;
-    int rx = regIndex(MO.getReg());
-    if (rx < 0)
-      continue;
+    for (int rx : regIndizes(MO.getReg())) {
+      // This instruction explicitly defines rx.
+      DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr
+                   << '\t' << *MI);
 
-    // This instruction explicitly defines rx.
-    DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr
-                 << '\t' << *MI);
+      // Check clearance before partial register updates.
+      // Call breakDependence before setting LiveRegs[rx].Def.
+      unsigned Pref = TII->getPartialRegUpdateClearance(MI, i, TRI);
+      if (Pref && shouldBreakDependence(MI, i, Pref))
+        TII->breakPartialRegDependency(MI, i, TRI);
 
-    // Check clearance before partial register updates.
-    // Call breakDependence before setting LiveRegs[rx].Def.
-    unsigned Pref = TII->getPartialRegUpdateClearance(MI, i, TRI);
-    if (Pref && shouldBreakDependence(MI, i, Pref))
-      TII->breakPartialRegDependency(MI, i, TRI);
+      // How many instructions since rx was last written?
+      LiveRegs[rx].Def = CurInstr;
 
-    // How many instructions since rx was last written?
-    LiveRegs[rx].Def = CurInstr;
-
-    // Kill off domains redefined by generic instructions.
-    if (Kill)
-      kill(rx);
+      // Kill off domains redefined by generic instructions.
+      if (Kill)
+        kill(rx);
+    }
   }
   ++CurInstr;
 }
@@ -582,19 +587,19 @@
                 e = mi->getDesc().getNumOperands(); i != e; ++i) {
     MachineOperand &mo = mi->getOperand(i);
     if (!mo.isReg()) continue;
-    int rx = regIndex(mo.getReg());
-    if (rx < 0) continue;
-    force(rx, domain);
+    for (int rx : regIndizes(mo.getReg())) {
+      force(rx, domain);
+    }
   }
 
   // Kill all defs and force them.
   for (unsigned i = 0, e = mi->getDesc().getNumDefs(); i != e; ++i) {
     MachineOperand &mo = mi->getOperand(i);
     if (!mo.isReg()) continue;
-    int rx = regIndex(mo.getReg());
-    if (rx < 0) continue;
-    kill(rx);
-    force(rx, domain);
+    for (int rx : regIndizes(mo.getReg())) {
+      kill(rx);
+      force(rx, domain);
+    }
   }
 }
 
@@ -611,9 +616,10 @@
                   e = mi->getDesc().getNumOperands(); i != e; ++i) {
       MachineOperand &mo = mi->getOperand(i);
       if (!mo.isReg()) continue;
-      int rx = regIndex(mo.getReg());
-      if (rx < 0) continue;
-      if (DomainValue *dv = LiveRegs[rx].Value) {
+      for (int rx : regIndizes(mo.getReg())) {
+        DomainValue *dv = LiveRegs[rx].Value;
+        if (dv == nullptr)
+          continue;
         // Bitmask of domains that dv and available have in common.
         unsigned common = dv->getCommonDomains(available);
         // Is it possible to use this collapsed register for free?
@@ -645,6 +651,7 @@
   SmallVector<LiveReg, 4> Regs;
   for (SmallVectorImpl<int>::iterator i=used.begin(), e=used.end(); i!=e; ++i) {
     int rx = *i;
+    assert(LiveRegs && "no space allocated for live registers");
     const LiveReg &LR = LiveRegs[rx];
     // This useless DomainValue could have been missed above.
     if (!LR.Value->getCommonDomains(available)) {
@@ -684,9 +691,11 @@
       continue;
 
     // If latest didn't merge, it is useless now. Kill all registers using it.
-    for (SmallVectorImpl<int>::iterator i=used.begin(), e=used.end(); i!=e; ++i)
-      if (LiveRegs[*i].Value == Latest)
-        kill(*i);
+    for (int i : used) {
+      assert(LiveRegs && "no space allocated for live registers");
+      if (LiveRegs[i].Value == Latest)
+        kill(i);
+    }
   }
 
   // dv is the DomainValue we are going to use for this instruction.
@@ -703,11 +712,11 @@
                                   ii != ee; ++ii) {
     MachineOperand &mo = *ii;
     if (!mo.isReg()) continue;
-    int rx = regIndex(mo.getReg());
-    if (rx < 0) continue;
-    if (!LiveRegs[rx].Value || (mo.isDef() && LiveRegs[rx].Value != dv)) {
-      kill(rx);
-      setLiveReg(rx, dv);
+    for (int rx : regIndizes(mo.getReg())) {
+      if (!LiveRegs[rx].Value || (mo.isDef() && LiveRegs[rx].Value != dv)) {
+        kill(rx);
+        setLiveReg(rx, dv);
+      }
     }
   }
 }
@@ -735,13 +744,13 @@
 
   // Initialize the AliasMap on the first use.
   if (AliasMap.empty()) {
-    // Given a PhysReg, AliasMap[PhysReg] is either the relevant index into RC,
-    // or -1.
-    AliasMap.resize(TRI->getNumRegs(), -1);
+    // Given a PhysReg, AliasMap[PhysReg] returns a list of indices into RC and
+    // therefore the LiveRegs array.
+    AliasMap.resize(TRI->getNumRegs());
     for (unsigned i = 0, e = RC->getNumRegs(); i != e; ++i)
       for (MCRegAliasIterator AI(RC->getRegister(i), TRI, true);
            AI.isValid(); ++AI)
-        AliasMap[*AI] = i;
+        AliasMap[*AI].push_back(i);
   }
 
   MachineBasicBlock *Entry = MF->begin();
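
With AliasMap now holding a SmallVector per register, one physical register can map to several tracked indices, for example a wide register that overlaps more than one register of the tracked class. A standalone sketch of the same build-and-query pattern using standard containers (the register numbers and the alias relation are invented for illustration):

#include <cstdio>
#include <vector>

int main() {
  // Hypothetical register file: registers 0..5, of which 2, 3 and 4 are
  // "interesting" (members of the tracked register class).
  const std::vector<unsigned> TrackedRegs = {2, 3, 4};

  // Made-up alias relation: register 5 overlaps both 3 and 4 (think of a
  // wide register containing two narrower ones).
  auto aliases = [](unsigned A, unsigned B) {
    if (A == B) return true;
    if ((A == 5 && (B == 3 || B == 4)) || (B == 5 && (A == 3 || A == 4)))
      return true;
    return false;
  };

  // AliasMap[PhysReg] holds *all* tracked indices PhysReg overlaps with,
  // instead of a single index or -1.
  std::vector<std::vector<int>> AliasMap(6);
  for (int Idx = 0; Idx != (int)TrackedRegs.size(); ++Idx)
    for (unsigned Reg = 0; Reg != 6; ++Reg)
      if (aliases(TrackedRegs[Idx], Reg))
        AliasMap[Reg].push_back(Idx);

  // Query: iterate over every tracked index a register maps to.
  for (int Idx : AliasMap[5])
    std::printf("reg 5 overlaps tracked index %d\n", Idx);
  return 0;
}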
diff --git a/lib/CodeGen/ForwardControlFlowIntegrity.cpp b/lib/CodeGen/ForwardControlFlowIntegrity.cpp
index 5e7e853..63c3699 100644
--- a/lib/CodeGen/ForwardControlFlowIntegrity.cpp
+++ b/lib/CodeGen/ForwardControlFlowIntegrity.cpp
@@ -25,9 +25,9 @@
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
diff --git a/lib/CodeGen/GCMetadata.cpp b/lib/CodeGen/GCMetadata.cpp
index ed40982..a2c5fce 100644
--- a/lib/CodeGen/GCMetadata.cpp
+++ b/lib/CodeGen/GCMetadata.cpp
@@ -24,22 +24,20 @@
 using namespace llvm;
 
 namespace {
-  
-  class Printer : public FunctionPass {
-    static char ID;
-    raw_ostream &OS;
-    
-  public:
-    explicit Printer(raw_ostream &OS) : FunctionPass(ID), OS(OS) {}
 
+class Printer : public FunctionPass {
+  static char ID;
+  raw_ostream &OS;
 
-    const char *getPassName() const override;
-    void getAnalysisUsage(AnalysisUsage &AU) const override;
+public:
+  explicit Printer(raw_ostream &OS) : FunctionPass(ID), OS(OS) {}
 
-    bool runOnFunction(Function &F) override;
-    bool doFinalization(Module &M) override;
-  };
+  const char *getPassName() const override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
 
+  bool runOnFunction(Function &F) override;
+  bool doFinalization(Module &M) override;
+};
 }
 
 INITIALIZE_PASS(GCModuleInfo, "collector-metadata",
@@ -48,7 +46,7 @@
 // -----------------------------------------------------------------------------
 
 GCFunctionInfo::GCFunctionInfo(const Function &F, GCStrategy &S)
-  : F(F), S(S), FrameSize(~0LL) {}
+    : F(F), S(S), FrameSize(~0LL) {}
 
 GCFunctionInfo::~GCFunctionInfo() {}
 
@@ -56,51 +54,29 @@
 
 char GCModuleInfo::ID = 0;
 
-GCModuleInfo::GCModuleInfo()
-    : ImmutablePass(ID) {
+GCModuleInfo::GCModuleInfo() : ImmutablePass(ID) {
   initializeGCModuleInfoPass(*PassRegistry::getPassRegistry());
 }
 
-GCStrategy *GCModuleInfo::getOrCreateStrategy(const Module *M,
-                                              const std::string &Name) {
-  strategy_map_type::iterator NMI = StrategyMap.find(Name);
-  if (NMI != StrategyMap.end())
-    return NMI->getValue();
-  
-  for (GCRegistry::iterator I = GCRegistry::begin(),
-                            E = GCRegistry::end(); I != E; ++I) {
-    if (Name == I->getName()) {
-      std::unique_ptr<GCStrategy> S = I->instantiate();
-      S->M = M;
-      S->Name = Name;
-      StrategyMap[Name] = S.get();
-      StrategyList.push_back(std::move(S));
-      return StrategyList.back().get();
-    }
-  }
- 
-  dbgs() << "unsupported GC: " << Name << "\n";
-  llvm_unreachable(nullptr);
-}
-
 GCFunctionInfo &GCModuleInfo::getFunctionInfo(const Function &F) {
   assert(!F.isDeclaration() && "Can only get GCFunctionInfo for a definition!");
   assert(F.hasGC());
-  
+
   finfo_map_type::iterator I = FInfoMap.find(&F);
   if (I != FInfoMap.end())
     return *I->second;
-  
-  GCStrategy *S = getOrCreateStrategy(F.getParent(), F.getGC());
-  GCFunctionInfo *GFI = S->insertFunctionInfo(F);
+
+  GCStrategy *S = getGCStrategy(F.getGC());
+  Functions.push_back(make_unique<GCFunctionInfo>(F, *S));
+  GCFunctionInfo *GFI = Functions.back().get();
   FInfoMap[&F] = GFI;
   return *GFI;
 }
 
 void GCModuleInfo::clear() {
+  Functions.clear();
   FInfoMap.clear();
-  StrategyMap.clear();
-  StrategyList.clear();
+  GCStrategyList.clear();
 }
 
 // -----------------------------------------------------------------------------
@@ -111,7 +87,6 @@
   return new Printer(OS);
 }
 
-
 const char *Printer::getPassName() const {
   return "Print Garbage Collector Information";
 }
@@ -124,42 +99,49 @@
 
 static const char *DescKind(GC::PointKind Kind) {
   switch (Kind) {
-    case GC::Loop:     return "loop";
-    case GC::Return:   return "return";
-    case GC::PreCall:  return "pre-call";
-    case GC::PostCall: return "post-call";
+  case GC::Loop:
+    return "loop";
+  case GC::Return:
+    return "return";
+  case GC::PreCall:
+    return "pre-call";
+  case GC::PostCall:
+    return "post-call";
   }
   llvm_unreachable("Invalid point kind");
 }
 
 bool Printer::runOnFunction(Function &F) {
-  if (F.hasGC()) return false;
-  
+  if (F.hasGC())
+    return false;
+
   GCFunctionInfo *FD = &getAnalysis<GCModuleInfo>().getFunctionInfo(F);
-  
+
   OS << "GC roots for " << FD->getFunction().getName() << ":\n";
   for (GCFunctionInfo::roots_iterator RI = FD->roots_begin(),
-                                      RE = FD->roots_end(); RI != RE; ++RI)
+                                      RE = FD->roots_end();
+       RI != RE; ++RI)
     OS << "\t" << RI->Num << "\t" << RI->StackOffset << "[sp]\n";
-  
+
   OS << "GC safe points for " << FD->getFunction().getName() << ":\n";
-  for (GCFunctionInfo::iterator PI = FD->begin(),
-                                PE = FD->end(); PI != PE; ++PI) {
-    
-    OS << "\t" << PI->Label->getName() << ": "
-       << DescKind(PI->Kind) << ", live = {";
-    
+  for (GCFunctionInfo::iterator PI = FD->begin(), PE = FD->end(); PI != PE;
+       ++PI) {
+
+    OS << "\t" << PI->Label->getName() << ": " << DescKind(PI->Kind)
+       << ", live = {";
+
     for (GCFunctionInfo::live_iterator RI = FD->live_begin(PI),
-                                       RE = FD->live_end(PI);;) {
+                                       RE = FD->live_end(PI);
+         ;) {
       OS << " " << RI->Num;
       if (++RI == RE)
         break;
       OS << ",";
     }
-    
+
     OS << " }\n";
   }
-  
+
   return false;
 }
 
@@ -169,3 +151,23 @@
   GMI->clear();
   return false;
 }
+
+
+GCStrategy *GCModuleInfo::getGCStrategy(const StringRef Name) {
+  // TODO: Arguably, just doing a linear search would be faster for small N
+  auto NMI = GCStrategyMap.find(Name);
+  if (NMI != GCStrategyMap.end())
+    return NMI->getValue();
+  
+  for (auto& Entry : GCRegistry::entries()) {
+    if (Name == Entry.getName()) {
+      std::unique_ptr<GCStrategy> S = Entry.instantiate();
+      S->Name = Name;
+      GCStrategyMap[Name] = S.get();
+      GCStrategyList.push_back(std::move(S));
+      return GCStrategyList.back().get();
+    }
+  }
+
+  report_fatal_error(std::string("unsupported GC: ") + Name);
+}
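
getGCStrategy above is a cache-then-registry lookup: return a previously instantiated strategy when the name is already known, otherwise scan the registry, instantiate, and record both the owning pointer and the name mapping. A simplified standalone version of that pattern (the Strategy type, registry contents, and names are invented; the real code uses StringMap and the GCRegistry plugin machinery):

#include <cstdio>
#include <functional>
#include <map>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

struct Strategy {
  std::string Name;
};

// Registry of known strategy names and their factories (illustrative only).
static const std::vector<
    std::pair<std::string, std::function<std::unique_ptr<Strategy>()>>>
    Registry = {
        {"erlang", [] { return std::make_unique<Strategy>(); }},
        {"shadow-stack", [] { return std::make_unique<Strategy>(); }},
};

struct StrategyCache {
  std::map<std::string, Strategy *> ByName;          // non-owning cache
  std::vector<std::unique_ptr<Strategy>> Strategies; // owning list

  Strategy *get(const std::string &Name) {
    auto It = ByName.find(Name);
    if (It != ByName.end())
      return It->second; // cache hit

    for (const auto &Entry : Registry)
      if (Entry.first == Name) {
        auto S = Entry.second();
        S->Name = Name;
        ByName[Name] = S.get();
        Strategies.push_back(std::move(S));
        return Strategies.back().get();
      }
    throw std::runtime_error("unsupported GC: " + Name);
  }
};

int main() {
  StrategyCache Cache;
  Strategy *A = Cache.get("erlang");
  Strategy *B = Cache.get("erlang"); // second lookup hits the cache
  std::printf("same object: %d\n", A == B);
  return 0;
}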
diff --git a/lib/CodeGen/GCMetadataPrinter.cpp b/lib/CodeGen/GCMetadataPrinter.cpp
index f80e9ce..bb8cfa1 100644
--- a/lib/CodeGen/GCMetadataPrinter.cpp
+++ b/lib/CodeGen/GCMetadataPrinter.cpp
@@ -14,14 +14,6 @@
 #include "llvm/CodeGen/GCMetadataPrinter.h"
 using namespace llvm;
 
-GCMetadataPrinter::GCMetadataPrinter() { }
+GCMetadataPrinter::GCMetadataPrinter() {}
 
-GCMetadataPrinter::~GCMetadataPrinter() { }
-
-void GCMetadataPrinter::beginAssembly(AsmPrinter &AP) {
-  // Default is no action.
-}
-
-void GCMetadataPrinter::finishAssembly(AsmPrinter &AP) {
-  // Default is no action.
-}
+GCMetadataPrinter::~GCMetadataPrinter() {}
diff --git a/lib/CodeGen/GCRootLowering.cpp b/lib/CodeGen/GCRootLowering.cpp
new file mode 100644
index 0000000..9d38e4c
--- /dev/null
+++ b/lib/CodeGen/GCRootLowering.cpp
@@ -0,0 +1,351 @@
+//===-- GCRootLowering.cpp - Garbage collection infrastructure ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the lowering for the gc.root mechanism.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+/// LowerIntrinsics - This pass rewrites calls to the llvm.gcread or
+/// llvm.gcwrite intrinsics, replacing them with simple loads and stores as
+/// directed by the GCStrategy. It also performs automatic root initialization
+/// and custom intrinsic lowering.
+class LowerIntrinsics : public FunctionPass {
+  bool PerformDefaultLowering(Function &F, GCStrategy &Coll);
+
+public:
+  static char ID;
+
+  LowerIntrinsics();
+  const char *getPassName() const override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  bool doInitialization(Module &M) override;
+  bool runOnFunction(Function &F) override;
+};
+
+/// GCMachineCodeAnalysis - This is a target-independent pass over the machine
+/// function representation to identify safe points for the garbage collector
+/// in the machine code. It inserts labels at safe points and populates a
+/// GCMetadata record for each function.
+class GCMachineCodeAnalysis : public MachineFunctionPass {
+  GCFunctionInfo *FI;
+  MachineModuleInfo *MMI;
+  const TargetInstrInfo *TII;
+
+  void FindSafePoints(MachineFunction &MF);
+  void VisitCallPoint(MachineBasicBlock::iterator MI);
+  MCSymbol *InsertLabel(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                        DebugLoc DL) const;
+
+  void FindStackOffsets(MachineFunction &MF);
+
+public:
+  static char ID;
+
+  GCMachineCodeAnalysis();
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+}
+
+// -----------------------------------------------------------------------------
+
+INITIALIZE_PASS_BEGIN(LowerIntrinsics, "gc-lowering", "GC Lowering", false,
+                      false)
+INITIALIZE_PASS_DEPENDENCY(GCModuleInfo)
+INITIALIZE_PASS_END(LowerIntrinsics, "gc-lowering", "GC Lowering", false, false)
+
+FunctionPass *llvm::createGCLoweringPass() { return new LowerIntrinsics(); }
+
+char LowerIntrinsics::ID = 0;
+
+LowerIntrinsics::LowerIntrinsics() : FunctionPass(ID) {
+  initializeLowerIntrinsicsPass(*PassRegistry::getPassRegistry());
+}
+
+const char *LowerIntrinsics::getPassName() const {
+  return "Lower Garbage Collection Instructions";
+}
+
+void LowerIntrinsics::getAnalysisUsage(AnalysisUsage &AU) const {
+  FunctionPass::getAnalysisUsage(AU);
+  AU.addRequired<GCModuleInfo>();
+  AU.addPreserved<DominatorTreeWrapperPass>();
+}
+
+static bool NeedsDefaultLoweringPass(const GCStrategy &C) {
+  // Default lowering is necessary only if read or write barriers have a default
+  // action. The default for roots is no action.
+  return !C.customWriteBarrier() || !C.customReadBarrier() ||
+         C.initializeRoots();
+}
+
+/// doInitialization - If this module uses the GC intrinsics, find them now.
+bool LowerIntrinsics::doInitialization(Module &M) {
+  GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
+  assert(MI && "LowerIntrinsics didn't require GCModuleInfo!?");
+  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+    if (!I->isDeclaration() && I->hasGC())
+      MI->getFunctionInfo(*I); // Instantiate the GC strategy.
+
+  return false;
+}
+
+/// CouldBecomeSafePoint - Predicate to conservatively determine whether the
+/// instruction could introduce a safe point.
+static bool CouldBecomeSafePoint(Instruction *I) {
+  // The natural definition of instructions which could introduce safe points
+  // are:
+  //
+  //   - call, invoke (AfterCall, BeforeCall)
+  //   - phis (Loops)
+  //   - invoke, ret, unwind (Exit)
+  //
+  // However, instructions as seemingly innocuous as arithmetic can become
+  // libcalls upon lowering (e.g., div i64 on a 32-bit platform), so instead
+  // it is necessary to take a conservative approach.
+
+  if (isa<AllocaInst>(I) || isa<GetElementPtrInst>(I) || isa<StoreInst>(I) ||
+      isa<LoadInst>(I))
+    return false;
+
+  // llvm.gcroot is safe because it doesn't do anything at runtime.
+  if (CallInst *CI = dyn_cast<CallInst>(I))
+    if (Function *F = CI->getCalledFunction())
+      if (unsigned IID = F->getIntrinsicID())
+        if (IID == Intrinsic::gcroot)
+          return false;
+
+  return true;
+}
+
+static bool InsertRootInitializers(Function &F, AllocaInst **Roots,
+                                   unsigned Count) {
+  // Scroll past alloca instructions.
+  BasicBlock::iterator IP = F.getEntryBlock().begin();
+  while (isa<AllocaInst>(IP))
+    ++IP;
+
+  // Search for initializers in the initial BB.
+  SmallPtrSet<AllocaInst *, 16> InitedRoots;
+  for (; !CouldBecomeSafePoint(IP); ++IP)
+    if (StoreInst *SI = dyn_cast<StoreInst>(IP))
+      if (AllocaInst *AI =
+              dyn_cast<AllocaInst>(SI->getOperand(1)->stripPointerCasts()))
+        InitedRoots.insert(AI);
+
+  // Add root initializers.
+  bool MadeChange = false;
+
+  for (AllocaInst **I = Roots, **E = Roots + Count; I != E; ++I)
+    if (!InitedRoots.count(*I)) {
+      StoreInst *SI = new StoreInst(
+          ConstantPointerNull::get(cast<PointerType>(
+              cast<PointerType>((*I)->getType())->getElementType())),
+          *I);
+      SI->insertAfter(*I);
+      MadeChange = true;
+    }
+
+  return MadeChange;
+}
+
+/// runOnFunction - Replace gcread/gcwrite intrinsics with loads and stores.
+/// Leave gcroot intrinsics; the code generator needs to see those.
+bool LowerIntrinsics::runOnFunction(Function &F) {
+  // Quick exit for functions that do not use GC.
+  if (!F.hasGC())
+    return false;
+
+  GCFunctionInfo &FI = getAnalysis<GCModuleInfo>().getFunctionInfo(F);
+  GCStrategy &S = FI.getStrategy();
+
+  bool MadeChange = false;
+
+  if (NeedsDefaultLoweringPass(S))
+    MadeChange |= PerformDefaultLowering(F, S);
+
+  return MadeChange;
+}
+
+bool LowerIntrinsics::PerformDefaultLowering(Function &F, GCStrategy &S) {
+  bool LowerWr = !S.customWriteBarrier();
+  bool LowerRd = !S.customReadBarrier();
+  bool InitRoots = S.initializeRoots();
+
+  SmallVector<AllocaInst *, 32> Roots;
+
+  bool MadeChange = false;
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
+      if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(II++)) {
+        Function *F = CI->getCalledFunction();
+        switch (F->getIntrinsicID()) {
+        case Intrinsic::gcwrite:
+          if (LowerWr) {
+            // Replace a write barrier with a simple store.
+            Value *St =
+                new StoreInst(CI->getArgOperand(0), CI->getArgOperand(2), CI);
+            CI->replaceAllUsesWith(St);
+            CI->eraseFromParent();
+          }
+          break;
+        case Intrinsic::gcread:
+          if (LowerRd) {
+            // Replace a read barrier with a simple load.
+            Value *Ld = new LoadInst(CI->getArgOperand(1), "", CI);
+            Ld->takeName(CI);
+            CI->replaceAllUsesWith(Ld);
+            CI->eraseFromParent();
+          }
+          break;
+        case Intrinsic::gcroot:
+          if (InitRoots) {
+            // Initialize the GC root, but do not delete the intrinsic. The
+            // backend needs the intrinsic to flag the stack slot.
+            Roots.push_back(
+                cast<AllocaInst>(CI->getArgOperand(0)->stripPointerCasts()));
+          }
+          break;
+        default:
+          continue;
+        }
+
+        MadeChange = true;
+      }
+    }
+  }
+
+  if (Roots.size())
+    MadeChange |= InsertRootInitializers(F, Roots.begin(), Roots.size());
+
+  return MadeChange;
+}
+
+// -----------------------------------------------------------------------------
+
+char GCMachineCodeAnalysis::ID = 0;
+char &llvm::GCMachineCodeAnalysisID = GCMachineCodeAnalysis::ID;
+
+INITIALIZE_PASS(GCMachineCodeAnalysis, "gc-analysis",
+                "Analyze Machine Code For Garbage Collection", false, false)
+
+GCMachineCodeAnalysis::GCMachineCodeAnalysis() : MachineFunctionPass(ID) {}
+
+void GCMachineCodeAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+  MachineFunctionPass::getAnalysisUsage(AU);
+  AU.setPreservesAll();
+  AU.addRequired<MachineModuleInfo>();
+  AU.addRequired<GCModuleInfo>();
+}
+
+MCSymbol *GCMachineCodeAnalysis::InsertLabel(MachineBasicBlock &MBB,
+                                             MachineBasicBlock::iterator MI,
+                                             DebugLoc DL) const {
+  MCSymbol *Label = MBB.getParent()->getContext().CreateTempSymbol();
+  BuildMI(MBB, MI, DL, TII->get(TargetOpcode::GC_LABEL)).addSym(Label);
+  return Label;
+}
+
+void GCMachineCodeAnalysis::VisitCallPoint(MachineBasicBlock::iterator CI) {
+  // Find the return address (next instruction), too, so as to bracket the call
+  // instruction.
+  MachineBasicBlock::iterator RAI = CI;
+  ++RAI;
+
+  if (FI->getStrategy().needsSafePoint(GC::PreCall)) {
+    MCSymbol *Label = InsertLabel(*CI->getParent(), CI, CI->getDebugLoc());
+    FI->addSafePoint(GC::PreCall, Label, CI->getDebugLoc());
+  }
+
+  if (FI->getStrategy().needsSafePoint(GC::PostCall)) {
+    MCSymbol *Label = InsertLabel(*CI->getParent(), RAI, CI->getDebugLoc());
+    FI->addSafePoint(GC::PostCall, Label, CI->getDebugLoc());
+  }
+}
+
+void GCMachineCodeAnalysis::FindSafePoints(MachineFunction &MF) {
+  for (MachineFunction::iterator BBI = MF.begin(), BBE = MF.end(); BBI != BBE;
+       ++BBI)
+    for (MachineBasicBlock::iterator MI = BBI->begin(), ME = BBI->end();
+         MI != ME; ++MI)
+      if (MI->isCall()) {
+        // Do not treat tail or sibling call sites as safe points.  This is
+        // legal since any arguments passed to the callee which live in the
+        // remnants of the caller's frame will be owned and updated by the
+        // callee if required.
+        if (MI->isTerminator())
+          continue;
+        VisitCallPoint(MI);
+      }
+}
+
+void GCMachineCodeAnalysis::FindStackOffsets(MachineFunction &MF) {
+  const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+  assert(TFI && "TargetRegisterInfo not available!");
+
+  for (GCFunctionInfo::roots_iterator RI = FI->roots_begin();
+       RI != FI->roots_end();) {
+    // If the root references a dead object, no need to keep it.
+    if (MF.getFrameInfo()->isDeadObjectIndex(RI->Num)) {
+      RI = FI->removeStackRoot(RI);
+    } else {
+      RI->StackOffset = TFI->getFrameIndexOffset(MF, RI->Num);
+      ++RI;
+    }
+  }
+}
+
+bool GCMachineCodeAnalysis::runOnMachineFunction(MachineFunction &MF) {
+  // Quick exit for functions that do not use GC.
+  if (!MF.getFunction()->hasGC())
+    return false;
+
+  FI = &getAnalysis<GCModuleInfo>().getFunctionInfo(*MF.getFunction());
+  if (!FI->getStrategy().needsSafePoints())
+    return false;
+
+  MMI = &getAnalysis<MachineModuleInfo>();
+  TII = MF.getSubtarget().getInstrInfo();
+
+  // Find the size of the stack frame.
+  FI->setFrameSize(MF.getFrameInfo()->getStackSize());
+
+  // Find all safe points.
+  FindSafePoints(MF);
+
+  // Find the stack offsets for all roots.
+  FindStackOffsets(MF);
+
+  return false;
+}
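
VisitCallPoint brackets each call by inserting a GC_LABEL at the call itself (pre-call) and at the following instruction, the return address (post-call). A standalone sketch of that bracketing over a toy instruction list (the Instr type and the label spelling are invented for illustration):

#include <cstdio>
#include <iterator>
#include <list>
#include <string>

struct Instr {
  std::string Text;
  bool IsCall = false;
  bool IsTerminator = false;
};

int main() {
  // Toy basic block: a call followed by an add and a return.
  std::list<Instr> Block = {
      {"call foo", true, false}, {"add", false, false}, {"ret", false, true}};

  int NextLabel = 0;
  for (auto MI = Block.begin(); MI != Block.end(); ++MI) {
    if (!MI->IsCall || MI->IsTerminator) // skip tail-call-like terminators
      continue;
    // The "return address" is the next instruction after the call.
    auto RAI = std::next(MI);
    // Pre-call label goes before the call, post-call label before RAI.
    Block.insert(MI, {"LBL" + std::to_string(NextLabel++) + ": (pre-call)"});
    Block.insert(RAI, {"LBL" + std::to_string(NextLabel++) + ": (post-call)"});
  }

  for (const Instr &I : Block)
    std::printf("%s\n", I.Text.c_str());
  return 0;
}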
diff --git a/lib/CodeGen/GCStrategy.cpp b/lib/CodeGen/GCStrategy.cpp
index b346657..554d326 100644
--- a/lib/CodeGen/GCStrategy.cpp
+++ b/lib/CodeGen/GCStrategy.cpp
@@ -1,4 +1,4 @@
-//===-- GCStrategy.cpp - Garbage collection infrastructure -----------------===//
+//===-- GCStrategy.cpp - Garbage Collector Description --------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,417 +7,16 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements target- and collector-independent garbage collection
-// infrastructure.
-//
-// GCMachineCodeAnalysis identifies the GC safe points in the machine code.
-// Roots are identified in SelectionDAGISel.
+// This file implements the policy object GCStrategy which describes the
+// behavior of a given garbage collector.
 //
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/GCStrategy.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
 
-namespace {
-
-  /// LowerIntrinsics - This pass rewrites calls to the llvm.gcread or
-  /// llvm.gcwrite intrinsics, replacing them with simple loads and stores as
-  /// directed by the GCStrategy. It also performs automatic root initialization
-  /// and custom intrinsic lowering.
-  class LowerIntrinsics : public FunctionPass {
-    static bool NeedsDefaultLoweringPass(const GCStrategy &C);
-    static bool NeedsCustomLoweringPass(const GCStrategy &C);
-    static bool CouldBecomeSafePoint(Instruction *I);
-    bool PerformDefaultLowering(Function &F, GCStrategy &Coll);
-    static bool InsertRootInitializers(Function &F,
-                                       AllocaInst **Roots, unsigned Count);
-
-  public:
-    static char ID;
-
-    LowerIntrinsics();
-    const char *getPassName() const override;
-    void getAnalysisUsage(AnalysisUsage &AU) const override;
-
-    bool doInitialization(Module &M) override;
-    bool runOnFunction(Function &F) override;
-  };
-
-
-  /// GCMachineCodeAnalysis - This is a target-independent pass over the machine
-  /// function representation to identify safe points for the garbage collector
-  /// in the machine code. It inserts labels at safe points and populates a
-  /// GCMetadata record for each function.
-  class GCMachineCodeAnalysis : public MachineFunctionPass {
-    const TargetMachine *TM;
-    GCFunctionInfo *FI;
-    MachineModuleInfo *MMI;
-    const TargetInstrInfo *TII;
-
-    void FindSafePoints(MachineFunction &MF);
-    void VisitCallPoint(MachineBasicBlock::iterator MI);
-    MCSymbol *InsertLabel(MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator MI,
-                          DebugLoc DL) const;
-
-    void FindStackOffsets(MachineFunction &MF);
-
-  public:
-    static char ID;
-
-    GCMachineCodeAnalysis();
-    void getAnalysisUsage(AnalysisUsage &AU) const override;
-
-    bool runOnMachineFunction(MachineFunction &MF) override;
-  };
-
-}
-
-// -----------------------------------------------------------------------------
-
-GCStrategy::GCStrategy() :
-  NeededSafePoints(0),
-  CustomReadBarriers(false),
-  CustomWriteBarriers(false),
-  CustomRoots(false),
-  CustomSafePoints(false),
-  InitRoots(true),
-  UsesMetadata(false)
-{}
-
-bool GCStrategy::initializeCustomLowering(Module &M) { return false; }
-
-bool GCStrategy::performCustomLowering(Function &F) {
-  dbgs() << "gc " << getName() << " must override performCustomLowering.\n";
-  llvm_unreachable("must override performCustomLowering");
-}
-
-
-bool GCStrategy::findCustomSafePoints(GCFunctionInfo& FI, MachineFunction &F) {
-  dbgs() << "gc " << getName() << " must override findCustomSafePoints.\n";
-  llvm_unreachable(nullptr);
-}
-
-
-GCFunctionInfo *GCStrategy::insertFunctionInfo(const Function &F) {
-  Functions.push_back(make_unique<GCFunctionInfo>(F, *this));
-  return Functions.back().get();
-}
-
-// -----------------------------------------------------------------------------
-
-INITIALIZE_PASS_BEGIN(LowerIntrinsics, "gc-lowering", "GC Lowering",
-                      false, false)
-INITIALIZE_PASS_DEPENDENCY(GCModuleInfo)
-INITIALIZE_PASS_END(LowerIntrinsics, "gc-lowering", "GC Lowering", false, false)
-
-FunctionPass *llvm::createGCLoweringPass() {
-  return new LowerIntrinsics();
-}
-
-char LowerIntrinsics::ID = 0;
-
-LowerIntrinsics::LowerIntrinsics()
-  : FunctionPass(ID) {
-    initializeLowerIntrinsicsPass(*PassRegistry::getPassRegistry());
-  }
-
-const char *LowerIntrinsics::getPassName() const {
-  return "Lower Garbage Collection Instructions";
-}
-
-void LowerIntrinsics::getAnalysisUsage(AnalysisUsage &AU) const {
-  FunctionPass::getAnalysisUsage(AU);
-  AU.addRequired<GCModuleInfo>();
-  AU.addPreserved<DominatorTreeWrapperPass>();
-}
-
-/// doInitialization - If this module uses the GC intrinsics, find them now.
-bool LowerIntrinsics::doInitialization(Module &M) {
-  // FIXME: This is rather antisocial in the context of a JIT since it performs
-  //        work against the entire module. But this cannot be done at
-  //        runFunction time (initializeCustomLowering likely needs to change
-  //        the module).
-  GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
-  assert(MI && "LowerIntrinsics didn't require GCModuleInfo!?");
-  for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
-    if (!I->isDeclaration() && I->hasGC())
-      MI->getFunctionInfo(*I); // Instantiate the GC strategy.
-
-  bool MadeChange = false;
-  for (GCModuleInfo::iterator I = MI->begin(), E = MI->end(); I != E; ++I)
-    if (NeedsCustomLoweringPass(**I))
-      if ((*I)->initializeCustomLowering(M))
-        MadeChange = true;
-
-  return MadeChange;
-}
-
-bool LowerIntrinsics::InsertRootInitializers(Function &F, AllocaInst **Roots,
-                                                          unsigned Count) {
-  // Scroll past alloca instructions.
-  BasicBlock::iterator IP = F.getEntryBlock().begin();
-  while (isa<AllocaInst>(IP)) ++IP;
-
-  // Search for initializers in the initial BB.
-  SmallPtrSet<AllocaInst*,16> InitedRoots;
-  for (; !CouldBecomeSafePoint(IP); ++IP)
-    if (StoreInst *SI = dyn_cast<StoreInst>(IP))
-      if (AllocaInst *AI =
-          dyn_cast<AllocaInst>(SI->getOperand(1)->stripPointerCasts()))
-        InitedRoots.insert(AI);
-
-  // Add root initializers.
-  bool MadeChange = false;
-
-  for (AllocaInst **I = Roots, **E = Roots + Count; I != E; ++I)
-    if (!InitedRoots.count(*I)) {
-      StoreInst* SI = new StoreInst(ConstantPointerNull::get(cast<PointerType>(
-                        cast<PointerType>((*I)->getType())->getElementType())),
-                        *I);
-      SI->insertAfter(*I);
-      MadeChange = true;
-    }
-
-  return MadeChange;
-}
-
-bool LowerIntrinsics::NeedsDefaultLoweringPass(const GCStrategy &C) {
-  // Default lowering is necessary only if read or write barriers have a default
-  // action. The default for roots is no action.
-  return !C.customWriteBarrier()
-      || !C.customReadBarrier()
-      || C.initializeRoots();
-}
-
-bool LowerIntrinsics::NeedsCustomLoweringPass(const GCStrategy &C) {
-  // Custom lowering is only necessary if enabled for some action.
-  return C.customWriteBarrier()
-      || C.customReadBarrier()
-      || C.customRoots();
-}
-
-/// CouldBecomeSafePoint - Predicate to conservatively determine whether the
-/// instruction could introduce a safe point.
-bool LowerIntrinsics::CouldBecomeSafePoint(Instruction *I) {
-  // The natural definition of instructions which could introduce safe points
-  // are:
-  //
-  //   - call, invoke (AfterCall, BeforeCall)
-  //   - phis (Loops)
-  //   - invoke, ret, unwind (Exit)
-  //
-  // However, instructions as seemingly inoccuous as arithmetic can become
-  // libcalls upon lowering (e.g., div i64 on a 32-bit platform), so instead
-  // it is necessary to take a conservative approach.
-
-  if (isa<AllocaInst>(I) || isa<GetElementPtrInst>(I) ||
-      isa<StoreInst>(I) || isa<LoadInst>(I))
-    return false;
-
-  // llvm.gcroot is safe because it doesn't do anything at runtime.
-  if (CallInst *CI = dyn_cast<CallInst>(I))
-    if (Function *F = CI->getCalledFunction())
-      if (unsigned IID = F->getIntrinsicID())
-        if (IID == Intrinsic::gcroot)
-          return false;
-
-  return true;
-}
-
-/// runOnFunction - Replace gcread/gcwrite intrinsics with loads and stores.
-/// Leave gcroot intrinsics; the code generator needs to see those.
-bool LowerIntrinsics::runOnFunction(Function &F) {
-  // Quick exit for functions that do not use GC.
-  if (!F.hasGC())
-    return false;
-
-  GCFunctionInfo &FI = getAnalysis<GCModuleInfo>().getFunctionInfo(F);
-  GCStrategy &S = FI.getStrategy();
-
-  bool MadeChange = false;
-
-  if (NeedsDefaultLoweringPass(S))
-    MadeChange |= PerformDefaultLowering(F, S);
-
-  bool UseCustomLoweringPass = NeedsCustomLoweringPass(S);
-  if (UseCustomLoweringPass)
-    MadeChange |= S.performCustomLowering(F);
-
-  // Custom lowering may modify the CFG, so dominators must be recomputed.
-  if (UseCustomLoweringPass) {
-    if (DominatorTreeWrapperPass *DTWP =
-            getAnalysisIfAvailable<DominatorTreeWrapperPass>())
-      DTWP->getDomTree().recalculate(F);
-  }
-
-  return MadeChange;
-}
-
-bool LowerIntrinsics::PerformDefaultLowering(Function &F, GCStrategy &S) {
-  bool LowerWr = !S.customWriteBarrier();
-  bool LowerRd = !S.customReadBarrier();
-  bool InitRoots = S.initializeRoots();
-
-  SmallVector<AllocaInst*, 32> Roots;
-
-  bool MadeChange = false;
-  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
-    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
-      if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(II++)) {
-        Function *F = CI->getCalledFunction();
-        switch (F->getIntrinsicID()) {
-        case Intrinsic::gcwrite:
-          if (LowerWr) {
-            // Replace a write barrier with a simple store.
-            Value *St = new StoreInst(CI->getArgOperand(0),
-                                      CI->getArgOperand(2), CI);
-            CI->replaceAllUsesWith(St);
-            CI->eraseFromParent();
-          }
-          break;
-        case Intrinsic::gcread:
-          if (LowerRd) {
-            // Replace a read barrier with a simple load.
-            Value *Ld = new LoadInst(CI->getArgOperand(1), "", CI);
-            Ld->takeName(CI);
-            CI->replaceAllUsesWith(Ld);
-            CI->eraseFromParent();
-          }
-          break;
-        case Intrinsic::gcroot:
-          if (InitRoots) {
-            // Initialize the GC root, but do not delete the intrinsic. The
-            // backend needs the intrinsic to flag the stack slot.
-            Roots.push_back(cast<AllocaInst>(
-                              CI->getArgOperand(0)->stripPointerCasts()));
-          }
-          break;
-        default:
-          continue;
-        }
-
-        MadeChange = true;
-      }
-    }
-  }
-
-  if (Roots.size())
-    MadeChange |= InsertRootInitializers(F, Roots.begin(), Roots.size());
-
-  return MadeChange;
-}
-
-// -----------------------------------------------------------------------------
-
-char GCMachineCodeAnalysis::ID = 0;
-char &llvm::GCMachineCodeAnalysisID = GCMachineCodeAnalysis::ID;
-
-INITIALIZE_PASS(GCMachineCodeAnalysis, "gc-analysis",
-                "Analyze Machine Code For Garbage Collection", false, false)
-
-GCMachineCodeAnalysis::GCMachineCodeAnalysis()
-  : MachineFunctionPass(ID) {}
-
-void GCMachineCodeAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
-  MachineFunctionPass::getAnalysisUsage(AU);
-  AU.setPreservesAll();
-  AU.addRequired<MachineModuleInfo>();
-  AU.addRequired<GCModuleInfo>();
-}
-
-MCSymbol *GCMachineCodeAnalysis::InsertLabel(MachineBasicBlock &MBB,
-                                             MachineBasicBlock::iterator MI,
-                                             DebugLoc DL) const {
-  MCSymbol *Label = MBB.getParent()->getContext().CreateTempSymbol();
-  BuildMI(MBB, MI, DL, TII->get(TargetOpcode::GC_LABEL)).addSym(Label);
-  return Label;
-}
-
-void GCMachineCodeAnalysis::VisitCallPoint(MachineBasicBlock::iterator CI) {
-  // Find the return address (next instruction), too, so as to bracket the call
-  // instruction.
-  MachineBasicBlock::iterator RAI = CI;
-  ++RAI;
-
-  if (FI->getStrategy().needsSafePoint(GC::PreCall)) {
-    MCSymbol* Label = InsertLabel(*CI->getParent(), CI, CI->getDebugLoc());
-    FI->addSafePoint(GC::PreCall, Label, CI->getDebugLoc());
-  }
-
-  if (FI->getStrategy().needsSafePoint(GC::PostCall)) {
-    MCSymbol* Label = InsertLabel(*CI->getParent(), RAI, CI->getDebugLoc());
-    FI->addSafePoint(GC::PostCall, Label, CI->getDebugLoc());
-  }
-}
-
-void GCMachineCodeAnalysis::FindSafePoints(MachineFunction &MF) {
-  for (MachineFunction::iterator BBI = MF.begin(),
-                                 BBE = MF.end(); BBI != BBE; ++BBI)
-    for (MachineBasicBlock::iterator MI = BBI->begin(),
-                                     ME = BBI->end(); MI != ME; ++MI)
-      if (MI->isCall())
-        VisitCallPoint(MI);
-}
-
-void GCMachineCodeAnalysis::FindStackOffsets(MachineFunction &MF) {
-  const TargetFrameLowering *TFI = TM->getSubtargetImpl()->getFrameLowering();
-  assert(TFI && "TargetRegisterInfo not available!");
-
-  for (GCFunctionInfo::roots_iterator RI = FI->roots_begin();
-       RI != FI->roots_end();) {
-    // If the root references a dead object, no need to keep it.
-    if (MF.getFrameInfo()->isDeadObjectIndex(RI->Num)) {
-      RI = FI->removeStackRoot(RI);
-    } else {
-      RI->StackOffset = TFI->getFrameIndexOffset(MF, RI->Num);
-      ++RI;
-    }
-  }
-}
-
-bool GCMachineCodeAnalysis::runOnMachineFunction(MachineFunction &MF) {
-  // Quick exit for functions that do not use GC.
-  if (!MF.getFunction()->hasGC())
-    return false;
-
-  FI = &getAnalysis<GCModuleInfo>().getFunctionInfo(*MF.getFunction());
-  if (!FI->getStrategy().needsSafePoints())
-    return false;
-
-  TM = &MF.getTarget();
-  MMI = &getAnalysis<MachineModuleInfo>();
-  TII = TM->getSubtargetImpl()->getInstrInfo();
-
-  // Find the size of the stack frame.
-  FI->setFrameSize(MF.getFrameInfo()->getStackSize());
-
-  // Find all safe points.
-  if (FI->getStrategy().customSafePoints()) {
-    FI->getStrategy().findCustomSafePoints(*FI, MF);
-  } else {
-    FindSafePoints(MF);
-  }
-
-  // Find the stack offsets for all roots.
-  FindStackOffsets(MF);
-
-  return false;
-}
+GCStrategy::GCStrategy()
+    : UseStatepoints(false), NeededSafePoints(0), CustomReadBarriers(false),
+      CustomWriteBarriers(false), CustomRoots(false), InitRoots(true),
+      UsesMetadata(false) {}
diff --git a/lib/CodeGen/GlobalMerge.cpp b/lib/CodeGen/GlobalMerge.cpp
index 457d7d6..4188e5d 100644
--- a/lib/CodeGen/GlobalMerge.cpp
+++ b/lib/CodeGen/GlobalMerge.cpp
@@ -54,6 +54,7 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -64,7 +65,6 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
-#include "llvm/CodeGen/Passes.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
@@ -90,10 +90,16 @@
      cl::desc("Enable global merge pass on external linkage"),
      cl::init(false));
 
-STATISTIC(NumMerged      , "Number of globals merged");
+STATISTIC(NumMerged, "Number of globals merged");
 namespace {
   class GlobalMerge : public FunctionPass {
     const TargetMachine *TM;
+    const DataLayout *DL;
+    // FIXME: Infer the maximum possible offset depending on the actual users
+    // (these max offsets are different for the users inside Thumb or ARM
+    // functions), see the code that passes in the offset in the ARM backend
+    // for more information.
+    unsigned MaxOffset;
 
     bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
                  Module &M, bool isConst, unsigned AddrSpace) const;
@@ -117,8 +123,10 @@
 
   public:
     static char ID;             // Pass identification, replacement for typeid.
-    explicit GlobalMerge(const TargetMachine *TM = nullptr)
-      : FunctionPass(ID), TM(TM) {
+    explicit GlobalMerge(const TargetMachine *TM = nullptr,
+                         unsigned MaximalOffset = 0)
+        : FunctionPass(ID), TM(TM), DL(TM->getDataLayout()),
+          MaxOffset(MaximalOffset) {
       initializeGlobalMergePass(*PassRegistry::getPassRegistry());
     }
 
@@ -138,22 +146,16 @@
 } // end anonymous namespace
 
 char GlobalMerge::ID = 0;
-INITIALIZE_TM_PASS(GlobalMerge, "global-merge", "Merge global variables",
-                   false, false)
+INITIALIZE_PASS_BEGIN(GlobalMerge, "global-merge", "Merge global variables",
+                      false, false)
+INITIALIZE_PASS_END(GlobalMerge, "global-merge", "Merge global variables",
+                    false, false)
 
 bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
                           Module &M, bool isConst, unsigned AddrSpace) const {
-  const TargetLowering *TLI = TM->getSubtargetImpl()->getTargetLowering();
-  const DataLayout *DL = TLI->getDataLayout();
-
-  // FIXME: Infer the maximum possible offset depending on the actual users
-  // (these max offsets are different for the users inside Thumb or ARM
-  // functions)
-  unsigned MaxOffset = TLI->getMaximalGlobalOffset();
-
   // FIXME: Find better heuristics
   std::stable_sort(Globals.begin(), Globals.end(),
-                   [DL](const GlobalVariable *GV1, const GlobalVariable *GV2) {
+                   [this](const GlobalVariable *GV1, const GlobalVariable *GV2) {
     Type *Ty1 = cast<PointerType>(GV1->getType())->getElementType();
     Type *Ty2 = cast<PointerType>(GV2->getType())->getElementType();
 
@@ -282,9 +284,6 @@
 
   DenseMap<unsigned, SmallVector<GlobalVariable*, 16> > Globals, ConstGlobals,
                                                         BSSGlobals;
-  const TargetLowering *TLI = TM->getSubtargetImpl()->getTargetLowering();
-  const DataLayout *DL = TLI->getDataLayout();
-  unsigned MaxOffset = TLI->getMaximalGlobalOffset();
   bool Changed = false;
   setMustKeepGlobalVariables(M);
 
@@ -357,6 +356,6 @@
   return false;
 }
 
-Pass *llvm::createGlobalMergePass(const TargetMachine *TM) {
-  return new GlobalMerge(TM);
+Pass *llvm::createGlobalMergePass(const TargetMachine *TM, unsigned Offset) {
+  return new GlobalMerge(TM, Offset);
 }
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index e84d25d..7a29569 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -271,15 +271,13 @@
 INITIALIZE_PASS_END(IfConverter, "if-converter", "If Converter", false, false)
 
 bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
-  TLI = MF.getSubtarget().getTargetLowering();
-  TII = MF.getSubtarget().getInstrInfo();
-  TRI = MF.getSubtarget().getRegisterInfo();
+  const TargetSubtargetInfo &ST = MF.getSubtarget();
+  TLI = ST.getTargetLowering();
+  TII = ST.getInstrInfo();
+  TRI = ST.getRegisterInfo();
   MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
   MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
   MRI = &MF.getRegInfo();
-
-  const TargetSubtargetInfo &ST =
-    MF.getTarget().getSubtarget<TargetSubtargetInfo>();
   SchedModel.init(ST.getSchedModel(), &ST, TII);
 
   if (!TII) return false;
@@ -290,7 +288,7 @@
   if (!PreRegAlloc) {
     // Tail merge tend to expose more if-conversion opportunities.
     BranchFolder BF(true, false, *MBFI, *MBPI);
-    BFChange = BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(),
+    BFChange = BF.OptimizeFunction(MF, TII, ST.getRegisterInfo(),
                                    getAnalysisIfAvailable<MachineModuleInfo>());
   }
 
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp
index 6a6e15d..f0d407f 100644
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp
@@ -508,6 +508,7 @@
   SmallVector<std::pair<unsigned, VNInfo*>, 8> WorkList;
   WorkList.push_back(std::make_pair(UseReg, UseVNI));
 
+  LiveInterval &OrigLI = LIS.getInterval(Original);
   do {
     unsigned Reg;
     VNInfo *VNI;
@@ -521,8 +522,11 @@
 
     // Trace through PHI-defs created by live range splitting.
     if (VNI->isPHIDef()) {
-      // Stop at original PHIs.  We don't know the value at the predecessors.
-      if (VNI->def == OrigVNI->def) {
+      // Stop at original PHIs.  We don't know the value at the
+      // predecessors. Look up the VNInfo for the current definition
+      // in OrigLI, to properly determine whether or not this phi was
+      // added by splitting.
+      if (VNI->def == OrigLI.getVNInfoAt(VNI->def)->def) {
         DEBUG(dbgs() << "orig phi value\n");
         SVI->second.DefByOrigPHI = true;
         SVI->second.AllDefsAreReloads = false;
@@ -542,7 +546,6 @@
       // Separate all values dominated by OrigVNI into PHIs and non-PHIs.
       SmallVector<VNInfo*, 8> PHIs, NonPHIs;
       LiveInterval &LI = LIS.getInterval(Reg);
-      LiveInterval &OrigLI = LIS.getInterval(Original);
 
       for (LiveInterval::vni_iterator VI = LI.vni_begin(), VE = LI.vni_end();
            VI != VE; ++VI) {
@@ -573,8 +576,8 @@
         std::tie(SVI, Inserted) =
           SibValues.insert(std::make_pair(NonPHI, SibValueInfo(Reg, NonPHI)));
         // Add all the PHIs as dependents of NonPHI.
-        for (unsigned pi = 0, pe = PHIs.size(); pi != pe; ++pi)
-          SVI->second.Deps.push_back(PHIs[pi]);
+        SVI->second.Deps.insert(SVI->second.Deps.end(), PHIs.begin(),
+                                PHIs.end());
         // This is the first time we see NonPHI, add it to the worklist.
         if (Inserted)
           WorkList.push_back(std::make_pair(Reg, NonPHI));
@@ -1088,7 +1091,8 @@
   bool WasCopy = MI->isCopy();
   unsigned ImpReg = 0;
 
-  bool SpillSubRegs = (MI->getOpcode() == TargetOpcode::PATCHPOINT ||
+  bool SpillSubRegs = (MI->getOpcode() == TargetOpcode::STATEPOINT ||
+                       MI->getOpcode() == TargetOpcode::PATCHPOINT ||
                        MI->getOpcode() == TargetOpcode::STACKMAP);
 
   // TargetInstrInfo::foldMemoryOperand only expects explicit, non-tied
@@ -1138,13 +1142,8 @@
       continue;
     // FoldMI does not define this physreg. Remove the LI segment.
     assert(MO->isDead() && "Cannot fold physreg def");
-    for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units) {
-      if (LiveRange *LR = LIS.getCachedRegUnit(*Units)) {
-        SlotIndex Idx = LIS.getInstructionIndex(MI).getRegSlot();
-        if (VNInfo *VNI = LR->getVNInfoAt(Idx))
-          LR->removeValNo(VNI);
-      }
-    }
+    SlotIndex Idx = LIS.getInstructionIndex(MI).getRegSlot();
+    LIS.removePhysRegDefAt(Reg, Idx);
   }
 
   LIS.ReplaceMachineInstrInMaps(MI, FoldMI);
diff --git a/lib/CodeGen/JumpInstrTables.cpp b/lib/CodeGen/JumpInstrTables.cpp
index 20f775c..75fa261 100644
--- a/lib/CodeGen/JumpInstrTables.cpp
+++ b/lib/CodeGen/JumpInstrTables.cpp
@@ -13,7 +13,6 @@
 #define DEBUG_TYPE "jt"
 
 #include "llvm/CodeGen/JumpInstrTables.h"
-
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/JumpInstrTableInfo.h"
 #include "llvm/CodeGen/Passes.h"
@@ -30,7 +29,6 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-
 #include <vector>
 
 using namespace llvm;
@@ -117,8 +115,8 @@
     if (!isa<GlobalAlias>(C))
       C->replaceUsesOfWithOnConstant(GV, V, U);
   } else {
-    assert(false && "The Use of a Function symbol is neither an instruction nor"
-                    " a constant");
+    llvm_unreachable("The Use of a Function symbol is neither an instruction "
+                     "nor a constant");
   }
 
   return true;
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index 61face2..9c23368 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -12,23 +12,23 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Target/TargetMachine.h"
-
 #include "llvm/Analysis/JumpInstrTableInfo.h"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
 #include "llvm/CodeGen/ForwardControlFlowIntegrity.h"
 #include "llvm/CodeGen/JumpInstrTables.h"
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
@@ -78,8 +78,10 @@
   CodeGenInfo = T.createMCCodeGenInfo(Triple, RM, CM, OL);
 }
 
-void LLVMTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
-  PM.add(createBasicTargetTransformInfoPass(this));
+TargetIRAnalysis LLVMTargetMachine::getTargetIRAnalysis() {
+  return TargetIRAnalysis([this](Function &F) {
+    return TargetTransformInfo(BasicTTIImpl(this, F));
+  });
 }
 
 /// addPassesToX helper drives creation and initialization of TargetPassConfig.
@@ -90,7 +92,7 @@
                                           AnalysisID StopAfter) {
 
   // Add internal analysis passes from the target machine.
-  TM->addAnalysisPasses(PM);
+  PM.add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
 
   // Targets may override createPassConfig to provide a target-specific
   // subclass.
@@ -114,7 +116,7 @@
   // all the per-module stuff we're generating, including MCContext.
   MachineModuleInfo *MMI = new MachineModuleInfo(
       *TM->getMCAsmInfo(), *TM->getSubtargetImpl()->getRegisterInfo(),
-      &TM->getSubtargetImpl()->getTargetLowering()->getObjFileLowering());
+      TM->getObjFileLowering());
   PM.add(MMI);
 
   // Set up a MachineFunction for the rest of CodeGen to work on.
@@ -222,13 +224,11 @@
   }
 
   // Create the AsmPrinter, which takes ownership of AsmStreamer if successful.
-  FunctionPass *Printer = getTarget().createAsmPrinter(*this, *AsmStreamer);
+  FunctionPass *Printer =
+      getTarget().createAsmPrinter(*this, std::move(AsmStreamer));
   if (!Printer)
     return true;
 
-  // If successful, createAsmPrinter took ownership of AsmStreamer.
-  AsmStreamer.release();
-
   PM.add(Printer);
 
   return false;
@@ -262,20 +262,16 @@
   if (!MCE || !MAB)
     return true;
 
-  std::unique_ptr<MCStreamer> AsmStreamer;
-  AsmStreamer.reset(getTarget()
-                        .createMCObjectStreamer(getTargetTriple(), *Ctx, *MAB,
-                                                Out, MCE, STI,
-                                                Options.MCOptions.MCRelaxAll));
+  std::unique_ptr<MCStreamer> AsmStreamer(getTarget().createMCObjectStreamer(
+      getTargetTriple(), *Ctx, *MAB, Out, MCE, STI,
+      Options.MCOptions.MCRelaxAll));
 
   // Create the AsmPrinter, which takes ownership of AsmStreamer if successful.
-  FunctionPass *Printer = getTarget().createAsmPrinter(*this, *AsmStreamer);
+  FunctionPass *Printer =
+      getTarget().createAsmPrinter(*this, std::move(AsmStreamer));
   if (!Printer)
     return true;
 
-  // If successful, createAsmPrinter took ownership of AsmStreamer.
-  AsmStreamer.release();
-
   PM.add(Printer);
 
   return false; // success!
diff --git a/lib/CodeGen/LexicalScopes.cpp b/lib/CodeGen/LexicalScopes.cpp
index b621e3b..9eaf7da 100644
--- a/lib/CodeGen/LexicalScopes.cpp
+++ b/lib/CodeGen/LexicalScopes.cpp
@@ -104,14 +104,6 @@
   }
 }
 
-LexicalScope *LexicalScopes::findInlinedScope(DebugLoc DL) {
-  MDNode *Scope = nullptr;
-  MDNode *IA = nullptr;
-  DL.getScopeAndInlinedAt(Scope, IA, MF->getFunction()->getContext());
-  auto I = InlinedLexicalScopeMap.find(std::make_pair(Scope, IA));
-  return I != InlinedLexicalScopeMap.end() ? &I->second : nullptr;
-}
-
 /// findLexicalScope - Find lexical scope, either regular or inlined, for the
 /// given DebugLoc. Return NULL if not found.
 LexicalScope *LexicalScopes::findLexicalScope(DebugLoc DL) {
@@ -168,11 +160,10 @@
   LexicalScope *Parent = nullptr;
   if (D.isLexicalBlock())
     Parent = getOrCreateLexicalScope(DebugLoc::getFromDILexicalBlock(Scope));
-  // FIXME: Use forward_as_tuple instead of make_tuple, once MSVC2012
-  // compatibility is no longer required.
-  I = LexicalScopeMap.emplace(std::piecewise_construct, std::make_tuple(Scope),
-                              std::make_tuple(Parent, DIDescriptor(Scope),
-                                              nullptr, false)).first;
+  I = LexicalScopeMap.emplace(std::piecewise_construct,
+                              std::forward_as_tuple(Scope),
+                              std::forward_as_tuple(Parent, DIDescriptor(Scope),
+                                                    nullptr, false)).first;
 
   if (!Parent) {
     assert(DIDescriptor(Scope).isSubprogram());
@@ -199,12 +190,11 @@
   else
     Parent = getOrCreateInlinedScope(Scope.getContext(), InlinedAt);
 
-  // FIXME: Use forward_as_tuple instead of make_tuple, once MSVC2012
-  // compatibility is no longer required.
   I = InlinedLexicalScopeMap.emplace(std::piecewise_construct,
-                                     std::make_tuple(P),
-                                     std::make_tuple(Parent, Scope, InlinedAt,
-                                                     false)).first;
+                                     std::forward_as_tuple(P),
+                                     std::forward_as_tuple(Parent, Scope,
+                                                           InlinedAt, false))
+          .first;
   return &I->second;
 }
 
diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp
index 1624851..dc936a3 100644
--- a/lib/CodeGen/LiveDebugVariables.cpp
+++ b/lib/CodeGen/LiveDebugVariables.cpp
@@ -40,7 +40,6 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
-
 #include <memory>
 
 using namespace llvm;
diff --git a/lib/CodeGen/LiveDebugVariables.h b/lib/CodeGen/LiveDebugVariables.h
index 7e3b361..9748329 100644
--- a/lib/CodeGen/LiveDebugVariables.h
+++ b/lib/CodeGen/LiveDebugVariables.h
@@ -22,8 +22,8 @@
 #define LLVM_LIB_CODEGEN_LIVEDEBUGVARIABLES_H
 
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/IR/DebugInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/IR/DebugInfo.h"
 
 namespace llvm {
 
diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp
index ddb0032..fd7516d 100644
--- a/lib/CodeGen/LiveInterval.cpp
+++ b/lib/CodeGen/LiveInterval.cpp
@@ -26,11 +26,278 @@
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include <algorithm>
 using namespace llvm;
 
+//===----------------------------------------------------------------------===//
+// Implementation of various methods necessary for calculation of live ranges.
+// The implementation of the methods abstracts from the concrete type of the
+// segment collection.
+//
+// Implementation of the class follows the Template design pattern. The base
+// class contains generic algorithms that call collection-specific methods,
+// which are provided in concrete subclasses. In order to avoid virtual calls
+// these methods are provided by means of C++ template instantiation.
+// The base class calls the methods of the subclass through method impl(),
+// which casts 'this' pointer to the type of the subclass.
+//
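+// As a minimal illustration of the idea (hypothetical names, not the classes
+// below):
+//
+//   template <typename ImplT> struct CalcBase {
+//     void run() { impl().step(); }                   // resolved statically
+//     ImplT &impl() { return *static_cast<ImplT *>(this); }
+//   };
+//   struct VectorCalc : CalcBase<VectorCalc> {
+//     void step() { /* collection-specific work */ }
+//   };
+//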
+//===----------------------------------------------------------------------===//
+
+template <typename ImplT, typename IteratorT, typename CollectionT>
+class CalcLiveRangeUtilBase {
+protected:
+  LiveRange *LR;
+
+protected:
+  CalcLiveRangeUtilBase(LiveRange *LR) : LR(LR) {}
+
+public:
+  typedef LiveRange::Segment Segment;
+  typedef IteratorT iterator;
+
+  VNInfo *createDeadDef(SlotIndex Def, VNInfo::Allocator &VNInfoAllocator) {
+    assert(!Def.isDead() && "Cannot define a value at the dead slot");
+
+    iterator I = impl().find(Def);
+    if (I == segments().end()) {
+      VNInfo *VNI = LR->getNextValue(Def, VNInfoAllocator);
+      impl().insertAtEnd(Segment(Def, Def.getDeadSlot(), VNI));
+      return VNI;
+    }
+
+    Segment *S = segmentAt(I);
+    if (SlotIndex::isSameInstr(Def, S->start)) {
+      assert(S->valno->def == S->start && "Inconsistent existing value def");
+
+      // It is possible to have both normal and early-clobber defs of the same
+      // register on an instruction. It doesn't make a lot of sense, but it is
+      // possible to specify in inline assembly.
+      //
+      // Just convert everything to early-clobber.
+      Def = std::min(Def, S->start);
+      if (Def != S->start)
+        S->start = S->valno->def = Def;
+      return S->valno;
+    }
+    assert(SlotIndex::isEarlierInstr(Def, S->start) && "Already live at def");
+    VNInfo *VNI = LR->getNextValue(Def, VNInfoAllocator);
+    segments().insert(I, Segment(Def, Def.getDeadSlot(), VNI));
+    return VNI;
+  }
+
+  VNInfo *extendInBlock(SlotIndex StartIdx, SlotIndex Use) {
+    if (segments().empty())
+      return nullptr;
+    iterator I =
+        impl().findInsertPos(Segment(Use.getPrevSlot(), Use, nullptr));
+    if (I == segments().begin())
+      return nullptr;
+    --I;
+    if (I->end <= StartIdx)
+      return nullptr;
+    if (I->end < Use)
+      extendSegmentEndTo(I, Use);
+    return I->valno;
+  }
+
+  /// This method is used when we want to extend the segment specified
+  /// by I to end at the specified endpoint. To do this, we should
+  /// merge and eliminate all segments that this will overlap
+  /// with. The iterator is not invalidated.
+  void extendSegmentEndTo(iterator I, SlotIndex NewEnd) {
+    assert(I != segments().end() && "Not a valid segment!");
+    Segment *S = segmentAt(I);
+    VNInfo *ValNo = I->valno;
+
+    // Search for the first segment that we can't merge with.
+    iterator MergeTo = std::next(I);
+    for (; MergeTo != segments().end() && NewEnd >= MergeTo->end; ++MergeTo)
+      assert(MergeTo->valno == ValNo && "Cannot merge with differing values!");
+
+    // If NewEnd was in the middle of a segment, make sure to get its endpoint.
+    S->end = std::max(NewEnd, std::prev(MergeTo)->end);
+
+    // If the newly formed segment now touches the segment after it and if they
+    // have the same value number, merge the two segments into one segment.
+    if (MergeTo != segments().end() && MergeTo->start <= I->end &&
+        MergeTo->valno == ValNo) {
+      S->end = MergeTo->end;
+      ++MergeTo;
+    }
+
+    // Erase any dead segments.
+    segments().erase(std::next(I), MergeTo);
+  }
+
+  /// This method is used when we want to extend the segment specified
+  /// by I to start at the specified endpoint.  To do this, we should
+  /// merge and eliminate all segments that this will overlap with.
+  iterator extendSegmentStartTo(iterator I, SlotIndex NewStart) {
+    assert(I != segments().end() && "Not a valid segment!");
+    Segment *S = segmentAt(I);
+    VNInfo *ValNo = I->valno;
+
+    // Search for the first segment that we can't merge with.
+    iterator MergeTo = I;
+    do {
+      if (MergeTo == segments().begin()) {
+        S->start = NewStart;
+        segments().erase(MergeTo, I);
+        return I;
+      }
+      assert(MergeTo->valno == ValNo && "Cannot merge with differing values!");
+      --MergeTo;
+    } while (NewStart <= MergeTo->start);
+
+    // If we start in the middle of another segment, just delete a range and
+    // extend that segment.
+    if (MergeTo->end >= NewStart && MergeTo->valno == ValNo) {
+      segmentAt(MergeTo)->end = S->end;
+    } else {
+      // Otherwise, extend the segment right after.
+      ++MergeTo;
+      Segment *MergeToSeg = segmentAt(MergeTo);
+      MergeToSeg->start = NewStart;
+      MergeToSeg->end = S->end;
+    }
+
+    segments().erase(std::next(MergeTo), std::next(I));
+    return MergeTo;
+  }
+
+  iterator addSegment(Segment S) {
+    SlotIndex Start = S.start, End = S.end;
+    iterator I = impl().findInsertPos(S);
+
+    // If the inserted segment starts in the middle or right at the end of
+    // another segment, just extend that segment to contain the segment of S.
+    if (I != segments().begin()) {
+      iterator B = std::prev(I);
+      if (S.valno == B->valno) {
+        if (B->start <= Start && B->end >= Start) {
+          extendSegmentEndTo(B, End);
+          return B;
+        }
+      } else {
+        // Check to make sure that we are not overlapping two live segments with
+        // different valno's.
+        assert(B->end <= Start &&
+               "Cannot overlap two segments with differing ValID's"
+               " (did you def the same reg twice in a MachineInstr?)");
+      }
+    }
+
+    // Otherwise, if this segment ends in the middle of, or right next
+    // to, another segment, merge it into that segment.
+    if (I != segments().end()) {
+      if (S.valno == I->valno) {
+        if (I->start <= End) {
+          I = extendSegmentStartTo(I, Start);
+
+          // If S is a complete superset of a segment, we may need to grow its
+          // endpoint as well.
+          if (End > I->end)
+            extendSegmentEndTo(I, End);
+          return I;
+        }
+      } else {
+        // Check to make sure that we are not overlapping two live segments with
+        // different valno's.
+        assert(I->start >= End &&
+               "Cannot overlap two segments with differing ValID's");
+      }
+    }
+
+    // Otherwise, this is just a new segment that doesn't interact with
+    // anything.
+    // Insert it.
+    return segments().insert(I, S);
+  }
+
+private:
+  ImplT &impl() { return *static_cast<ImplT *>(this); }
+
+  CollectionT &segments() { return impl().segmentsColl(); }
+
+  Segment *segmentAt(iterator I) { return const_cast<Segment *>(&(*I)); }
+};
+
+//===----------------------------------------------------------------------===//
+//   Instantiation of the methods for calculation of live ranges
+//   based on a segment vector.
+//===----------------------------------------------------------------------===//
+
+class CalcLiveRangeUtilVector;
+typedef CalcLiveRangeUtilBase<CalcLiveRangeUtilVector, LiveRange::iterator,
+                              LiveRange::Segments> CalcLiveRangeUtilVectorBase;
+
+class CalcLiveRangeUtilVector : public CalcLiveRangeUtilVectorBase {
+public:
+  CalcLiveRangeUtilVector(LiveRange *LR) : CalcLiveRangeUtilVectorBase(LR) {}
+
+private:
+  friend CalcLiveRangeUtilVectorBase;
+
+  LiveRange::Segments &segmentsColl() { return LR->segments; }
+
+  void insertAtEnd(const Segment &S) { LR->segments.push_back(S); }
+
+  iterator find(SlotIndex Pos) { return LR->find(Pos); }
+
+  iterator findInsertPos(Segment S) {
+    return std::upper_bound(LR->begin(), LR->end(), S.start);
+  }
+};
+
+//===----------------------------------------------------------------------===//
+//   Instantiation of the methods for calculation of live ranges
+//   based on a segment set.
+//===----------------------------------------------------------------------===//
+
+class CalcLiveRangeUtilSet;
+typedef CalcLiveRangeUtilBase<CalcLiveRangeUtilSet,
+                              LiveRange::SegmentSet::iterator,
+                              LiveRange::SegmentSet> CalcLiveRangeUtilSetBase;
+
+class CalcLiveRangeUtilSet : public CalcLiveRangeUtilSetBase {
+public:
+  CalcLiveRangeUtilSet(LiveRange *LR) : CalcLiveRangeUtilSetBase(LR) {}
+
+private:
+  friend CalcLiveRangeUtilSetBase;
+
+  LiveRange::SegmentSet &segmentsColl() { return *LR->segmentSet; }
+
+  void insertAtEnd(const Segment &S) {
+    LR->segmentSet->insert(LR->segmentSet->end(), S);
+  }
+
+  iterator find(SlotIndex Pos) {
+    iterator I =
+        LR->segmentSet->upper_bound(Segment(Pos, Pos.getNextSlot(), nullptr));
+    if (I == LR->segmentSet->begin())
+      return I;
+    iterator PrevI = std::prev(I);
+    if (Pos < (*PrevI).end)
+      return PrevI;
+    return I;
+  }
+
+  iterator findInsertPos(Segment S) {
+    iterator I = LR->segmentSet->upper_bound(S);
+    if (I != LR->segmentSet->end() && !(S.start < *I))
+      ++I;
+    return I;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+//   LiveRange methods
+//===----------------------------------------------------------------------===//
+
 LiveRange::iterator LiveRange::find(SlotIndex Pos) {
   // This algorithm is basically std::upper_bound.
   // Unfortunately, std::upper_bound cannot be used with mixed types until we
@@ -51,30 +318,11 @@
 
 VNInfo *LiveRange::createDeadDef(SlotIndex Def,
                                   VNInfo::Allocator &VNInfoAllocator) {
-  assert(!Def.isDead() && "Cannot define a value at the dead slot");
-  iterator I = find(Def);
-  if (I == end()) {
-    VNInfo *VNI = getNextValue(Def, VNInfoAllocator);
-    segments.push_back(Segment(Def, Def.getDeadSlot(), VNI));
-    return VNI;
-  }
-  if (SlotIndex::isSameInstr(Def, I->start)) {
-    assert(I->valno->def == I->start && "Inconsistent existing value def");
-
-    // It is possible to have both normal and early-clobber defs of the same
-    // register on an instruction. It doesn't make a lot of sense, but it is
-    // possible to specify in inline assembly.
-    //
-    // Just convert everything to early-clobber.
-    Def = std::min(Def, I->start);
-    if (Def != I->start)
-      I->start = I->valno->def = Def;
-    return I->valno;
-  }
-  assert(SlotIndex::isEarlierInstr(Def, I->start) && "Already live at def");
-  VNInfo *VNI = getNextValue(Def, VNInfoAllocator);
-  segments.insert(I, Segment(Def, Def.getDeadSlot(), VNI));
-  return VNI;
+  // Use the segment set, if it is available.
+  if (segmentSet != nullptr)
+    return CalcLiveRangeUtilSet(this).createDeadDef(Def, VNInfoAllocator);
+  // Otherwise use the segment vector.
+  return CalcLiveRangeUtilVector(this).createDeadDef(Def, VNInfoAllocator);
 }
 
 // overlaps - Return true if the intersection of the two live ranges is
@@ -185,6 +433,27 @@
   return I != begin() && (--I)->end > Start;
 }
 
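+/// Return true if every point that is live in Other is also live in this
+/// range; adjacent segments of this range may together cover a single
+/// segment of Other.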
+bool LiveRange::covers(const LiveRange &Other) const {
+  if (empty())
+    return Other.empty();
+
+  const_iterator I = begin();
+  for (const Segment &O : Other.segments) {
+    I = advanceTo(I, O.start);
+    if (I == end() || I->start > O.start)
+      return false;
+
+    // Check adjacent live segments and see if we can get past O.end.
+    while (I->end < O.end) {
+      const_iterator Last = I;
+      // Get next segment and abort if it was not adjacent.
+      ++I;
+      if (I == end() || Last->end != I->start)
+        return false;
+    }
+  }
+  return true;
+}
 
 /// ValNo is dead, remove it.  If it is the largest value number, just nuke it
 /// (and any other deleted values neighboring it), otherwise mark it as ~1U so
@@ -204,8 +473,8 @@
 void LiveRange::RenumberValues() {
   SmallPtrSet<VNInfo*, 8> Seen;
   valnos.clear();
-  for (const_iterator I = begin(), E = end(); I != E; ++I) {
-    VNInfo *VNI = I->valno;
+  for (const Segment &S : segments) {
+    VNInfo *VNI = S.valno;
     if (!Seen.insert(VNI).second)
       continue;
     assert(!VNI->isUnused() && "Unused valno used by live segment");
@@ -214,133 +483,35 @@
   }
 }
 
-/// This method is used when we want to extend the segment specified by I to end
-/// at the specified endpoint.  To do this, we should merge and eliminate all
-/// segments that this will overlap with.  The iterator is not invalidated.
-void LiveRange::extendSegmentEndTo(iterator I, SlotIndex NewEnd) {
-  assert(I != end() && "Not a valid segment!");
-  VNInfo *ValNo = I->valno;
-
-  // Search for the first segment that we can't merge with.
-  iterator MergeTo = std::next(I);
-  for (; MergeTo != end() && NewEnd >= MergeTo->end; ++MergeTo) {
-    assert(MergeTo->valno == ValNo && "Cannot merge with differing values!");
-  }
-
-  // If NewEnd was in the middle of a segment, make sure to get its endpoint.
-  I->end = std::max(NewEnd, std::prev(MergeTo)->end);
-
-  // If the newly formed segment now touches the segment after it and if they
-  // have the same value number, merge the two segments into one segment.
-  if (MergeTo != end() && MergeTo->start <= I->end &&
-      MergeTo->valno == ValNo) {
-    I->end = MergeTo->end;
-    ++MergeTo;
-  }
-
-  // Erase any dead segments.
-  segments.erase(std::next(I), MergeTo);
+void LiveRange::addSegmentToSet(Segment S) {
+  CalcLiveRangeUtilSet(this).addSegment(S);
 }
 
-
-/// This method is used when we want to extend the segment specified by I to
-/// start at the specified endpoint.  To do this, we should merge and eliminate
-/// all segments that this will overlap with.
-LiveRange::iterator
-LiveRange::extendSegmentStartTo(iterator I, SlotIndex NewStart) {
-  assert(I != end() && "Not a valid segment!");
-  VNInfo *ValNo = I->valno;
-
-  // Search for the first segment that we can't merge with.
-  iterator MergeTo = I;
-  do {
-    if (MergeTo == begin()) {
-      I->start = NewStart;
-      segments.erase(MergeTo, I);
-      return I;
-    }
-    assert(MergeTo->valno == ValNo && "Cannot merge with differing values!");
-    --MergeTo;
-  } while (NewStart <= MergeTo->start);
-
-  // If we start in the middle of another segment, just delete a range and
-  // extend that segment.
-  if (MergeTo->end >= NewStart && MergeTo->valno == ValNo) {
-    MergeTo->end = I->end;
-  } else {
-    // Otherwise, extend the segment right after.
-    ++MergeTo;
-    MergeTo->start = NewStart;
-    MergeTo->end = I->end;
+LiveRange::iterator LiveRange::addSegment(Segment S) {
+  // Use the segment set, if it is available.
+  if (segmentSet != nullptr) {
+    addSegmentToSet(S);
+    return end();
   }
-
-  segments.erase(std::next(MergeTo), std::next(I));
-  return MergeTo;
+  // Otherwise use the segment vector.
+  return CalcLiveRangeUtilVector(this).addSegment(S);
 }
 
-LiveRange::iterator LiveRange::addSegmentFrom(Segment S, iterator From) {
-  SlotIndex Start = S.start, End = S.end;
-  iterator it = std::upper_bound(From, end(), Start);
-
-  // If the inserted segment starts in the middle or right at the end of
-  // another segment, just extend that segment to contain the segment of S.
-  if (it != begin()) {
-    iterator B = std::prev(it);
-    if (S.valno == B->valno) {
-      if (B->start <= Start && B->end >= Start) {
-        extendSegmentEndTo(B, End);
-        return B;
-      }
-    } else {
-      // Check to make sure that we are not overlapping two live segments with
-      // different valno's.
-      assert(B->end <= Start &&
-             "Cannot overlap two segments with differing ValID's"
-             " (did you def the same reg twice in a MachineInstr?)");
-    }
-  }
-
-  // Otherwise, if this segment ends in the middle of, or right next to, another
-  // segment, merge it into that segment.
-  if (it != end()) {
-    if (S.valno == it->valno) {
-      if (it->start <= End) {
-        it = extendSegmentStartTo(it, Start);
-
-        // If S is a complete superset of a segment, we may need to grow its
-        // endpoint as well.
-        if (End > it->end)
-          extendSegmentEndTo(it, End);
-        return it;
-      }
-    } else {
-      // Check to make sure that we are not overlapping two live segments with
-      // different valno's.
-      assert(it->start >= End &&
-             "Cannot overlap two segments with differing ValID's");
-    }
-  }
-
-  // Otherwise, this is just a new segment that doesn't interact with anything.
-  // Insert it.
-  return segments.insert(it, S);
+void LiveRange::append(const Segment S) {
+  // Check that the segment belongs to the back of the list.
+  assert(segments.empty() || segments.back().end <= S.start);
+  segments.push_back(S);
 }
 
 /// extendInBlock - If this range is live before Kill in the basic
 /// block that starts at StartIdx, extend it to be live up to Kill and return
 /// the value. If there is no live range before Kill, return NULL.
 VNInfo *LiveRange::extendInBlock(SlotIndex StartIdx, SlotIndex Kill) {
-  if (empty())
-    return nullptr;
-  iterator I = std::upper_bound(begin(), end(), Kill.getPrevSlot());
-  if (I == begin())
-    return nullptr;
-  --I;
-  if (I->end <= StartIdx)
-    return nullptr;
-  if (I->end < Kill)
-    extendSegmentEndTo(I, Kill);
-  return I->valno;
+  // Use the segment set, if it is available.
+  if (segmentSet != nullptr)
+    return CalcLiveRangeUtilSet(this).extendInBlock(StartIdx, Kill);
+  // Otherwise use the segment vector.
+  return CalcLiveRangeUtilVector(this).extendInBlock(StartIdx, Kill);
 }
 
 /// Remove the specified segment from this range.  Note that the segment must
@@ -461,8 +632,8 @@
   // This can leave Other in an invalid state because we're not coalescing
   // touching segments that now have identical values. That's OK since Other is
   // not supposed to be valid after calling join();
-  for (iterator I = Other.begin(), E = Other.end(); I != E; ++I)
-    I->valno = NewVNInfo[RHSValNoAssignments[I->valno->id]];
+  for (Segment &S : Other.segments)
+    S.valno = NewVNInfo[RHSValNoAssignments[S.valno->id]];
 
   // Update val# info. Renumber them and make sure they all belong to this
   // LiveRange now. Also remove dead val#'s.
@@ -482,8 +653,8 @@
 
   // Okay, now insert the RHS live segments into the LHS.
   LiveRangeUpdater Updater(this);
-  for (iterator I = Other.begin(), E = Other.end(); I != E; ++I)
-    Updater.add(*I);
+  for (Segment &S : Other.segments)
+    Updater.add(S);
 }
 
 /// Merge all of the segments in RHS into this live range as the specified
@@ -493,8 +664,8 @@
 void LiveRange::MergeSegmentsInAsValue(const LiveRange &RHS,
                                        VNInfo *LHSValNo) {
   LiveRangeUpdater Updater(this);
-  for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I)
-    Updater.add(I->start, I->end, LHSValNo);
+  for (const Segment &S : RHS.segments)
+    Updater.add(S.start, S.end, LHSValNo);
 }
 
 /// MergeValueInAsValue - Merge all of the live segments of a specific val#
@@ -506,9 +677,9 @@
                                     const VNInfo *RHSValNo,
                                     VNInfo *LHSValNo) {
   LiveRangeUpdater Updater(this);
-  for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I)
-    if (I->valno == RHSValNo)
-      Updater.add(I->start, I->end, LHSValNo);
+  for (const Segment &S : RHS.segments)
+    if (S.valno == RHSValNo)
+      Updater.add(S.start, S.end, LHSValNo);
 }
 
 /// MergeValueNumberInto - This method is called when two value nubmers
@@ -570,10 +741,258 @@
   return V2;
 }
 
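+/// Transfer the segments collected in the temporary segment set into the
+/// segment vector and drop the set. This is only valid right after the
+/// initial live range computation, while the vector is still empty.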
+void LiveRange::flushSegmentSet() {
+  assert(segmentSet != nullptr && "segment set must have been created");
+  assert(
+      segments.empty() &&
+      "segment set can be used only initially before switching to the array");
+  segments.append(segmentSet->begin(), segmentSet->end());
+  delete segmentSet;
+  segmentSet = nullptr;
+  verify();
+}
+
+void LiveInterval::freeSubRange(SubRange *S) {
+  S->~SubRange();
+  // Memory was allocated with BumpPtr allocator and is not freed here.
+}
+
+void LiveInterval::removeEmptySubRanges() {
+  SubRange **NextPtr = &SubRanges;
+  SubRange *I = *NextPtr;
+  while (I != nullptr) {
+    if (!I->empty()) {
+      NextPtr = &I->Next;
+      I = *NextPtr;
+      continue;
+    }
+    // Skip empty subranges until we find the first nonempty one.
+    do {
+      SubRange *Next = I->Next;
+      freeSubRange(I);
+      I = Next;
+    } while (I != nullptr && I->empty());
+    *NextPtr = I;
+  }
+}
+
+void LiveInterval::clearSubRanges() {
+  for (SubRange *I = SubRanges, *Next; I != nullptr; I = Next) {
+    Next = I->Next;
+    freeSubRange(I);
+  }
+  SubRanges = nullptr;
+}
+
+/// Helper function for constructMainRangeFromSubranges(): Search the CFG
+/// backwards until we find a place covered by a LiveRange segment that actually
+/// has a valno set.
+static VNInfo *searchForVNI(const SlotIndexes &Indexes, LiveRange &LR,
+    const MachineBasicBlock *MBB,
+    SmallPtrSetImpl<const MachineBasicBlock*> &Visited) {
+  // We start the search at the end of MBB.
+  SlotIndex EndIdx = Indexes.getMBBEndIdx(MBB);
+  // In our use case we can't leave the area covered by the live segments
+  // without finding an actual VNI def.
+  LiveRange::iterator I = LR.find(EndIdx.getPrevSlot());
+  assert(I != LR.end());
+  LiveRange::Segment &S = *I;
+  if (S.valno != nullptr)
+    return S.valno;
+
+  VNInfo *VNI = nullptr;
+  // Continue at predecessors (we could even go to idom with domtree available).
+  for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+    // Avoid going in circles.
+    if (!Visited.insert(Pred).second)
+      continue;
+
+    VNI = searchForVNI(Indexes, LR, Pred, Visited);
+    if (VNI != nullptr) {
+      S.valno = VNI;
+      break;
+    }
+  }
+
+  return VNI;
+}
+
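+/// Fixup pass for constructMainRangeFromSubranges(): assign a value number to
+/// every segment that the forward scan left without one because the defining
+/// block comes later in the function layout.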
+static void determineMissingVNIs(const SlotIndexes &Indexes, LiveInterval &LI) {
+  SmallPtrSet<const MachineBasicBlock*, 5> Visited;
+  for (LiveRange::Segment &S : LI.segments) {
+    if (S.valno != nullptr)
+      continue;
+    // This can only happen at the beginning of a basic block.
+    assert(S.start.isBlock() && "valno should only be missing at block begin");
+
+    Visited.clear();
+    const MachineBasicBlock *MBB = Indexes.getMBBFromIndex(S.start);
+    for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+      VNInfo *VNI = searchForVNI(Indexes, LI, Pred, Visited);
+      if (VNI != nullptr) {
+        S.valno = VNI;
+        break;
+      }
+    }
+    assert(S.valno != nullptr && "could not determine valno");
+  }
+}
+
+void LiveInterval::constructMainRangeFromSubranges(
+    const SlotIndexes &Indexes, VNInfo::Allocator &VNIAllocator) {
+  // The basic observations on which this algorithm is based:
+  // - Each Def/ValNo in a subrange must have a corresponding def on the main
+  //   range, but no further defs/valnos are necessary.
+  // - If any of the subranges is live at a point, the main live range has to
+  //   be live too; conversely, if no subrange is live, the main range must not
+  //   be live either.
+  // We do this by scanning through all the subranges simultaneously, creating
+  // new segments in the main range as segment starts/ends come up in the
+  // subranges.
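+  //
+  // As a small illustration (made-up slot indices and lane masks): given the
+  // subranges
+  //   L0001 [16r,32r)  and  L0002 [16r,48r)
+  // some lane is live over [16r,48r) with a single def at 16r, so the main
+  // range gets the single segment [16r,48r). If L0001 were instead defined at
+  // 32r, i.e. [32r,48r), the main range would be split into [16r,32r) and
+  // [32r,48r), one segment per def.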
+  assert(hasSubRanges() && "expected subranges to be present");
+  assert(segments.empty() && valnos.empty() && "expected empty main range");
+
+  // Collect subrange, iterator pairs for the walk and determine first and last
+  // SlotIndex involved.
+  SmallVector<std::pair<const SubRange*, const_iterator>, 4> SRs;
+  SlotIndex First;
+  SlotIndex Last;
+  for (const SubRange &SR : subranges()) {
+    if (SR.empty())
+      continue;
+    SRs.push_back(std::make_pair(&SR, SR.begin()));
+    if (!First.isValid() || SR.segments.front().start < First)
+      First = SR.segments.front().start;
+    if (!Last.isValid() || SR.segments.back().end > Last)
+      Last = SR.segments.back().end;
+  }
+
+  // Walk over all subranges simultaneously.
+  Segment CurrentSegment;
+  bool ConstructingSegment = false;
+  bool NeedVNIFixup = false;
+  unsigned ActiveMask = 0;
+  SlotIndex Pos = First;
+  while (true) {
+    SlotIndex NextPos = Last;
+    enum {
+      NOTHING,
+      BEGIN_SEGMENT,
+      END_SEGMENT,
+    } Event = NOTHING;
+    // Which subregister lanes are affected by the current event.
+    unsigned EventMask = 0;
+    // Whether a BEGIN_SEGMENT is also a valno definition point.
+    bool IsDef = false;
+    // Find the next begin or end of a subrange segment. Combine masks if we
+    // have multiple begins/ends at the same position. Ends take precedence over
+    // Begins.
+    for (auto &SRP : SRs) {
+      const SubRange &SR = *SRP.first;
+      const_iterator &I = SRP.second;
+      // Advance iterator of subrange to a segment involving Pos; the earlier
+      // segments are already merged at this point.
+      while (I != SR.end() &&
+             (I->end < Pos ||
+              (I->end == Pos && (ActiveMask & SR.LaneMask) == 0)))
+        ++I;
+      if (I == SR.end())
+        continue;
+      if ((ActiveMask & SR.LaneMask) == 0 &&
+          Pos <= I->start && I->start <= NextPos) {
+        // Merge multiple begins at the same position.
+        if (I->start == NextPos && Event == BEGIN_SEGMENT) {
+          EventMask |= SR.LaneMask;
+          IsDef |= I->valno->def == I->start;
+        } else if (I->start < NextPos || Event != END_SEGMENT) {
+          Event = BEGIN_SEGMENT;
+          NextPos = I->start;
+          EventMask = SR.LaneMask;
+          IsDef = I->valno->def == I->start;
+        }
+      }
+      if ((ActiveMask & SR.LaneMask) != 0 &&
+          Pos <= I->end && I->end <= NextPos) {
+        // Merge multiple ends at the same position.
+        if (I->end == NextPos && Event == END_SEGMENT)
+          EventMask |= SR.LaneMask;
+        else {
+          Event = END_SEGMENT;
+          NextPos = I->end;
+          EventMask = SR.LaneMask;
+        }
+      }
+    }
+
+    // Advance scan position.
+    Pos = NextPos;
+    if (Event == BEGIN_SEGMENT) {
+      if (ConstructingSegment && IsDef) {
+        // Finish previous segment because we have to start a new one.
+        CurrentSegment.end = Pos;
+        append(CurrentSegment);
+        ConstructingSegment = false;
+      }
+
+      // Start a new segment if necessary.
+      if (!ConstructingSegment) {
+        // Determine value number for the segment.
+        VNInfo *VNI;
+        if (IsDef) {
+          VNI = getNextValue(Pos, VNIAllocator);
+        } else {
+          // We have to reuse an existing value number. If we are lucky, we
+          // already passed one of the predecessor blocks and determined its
+          // value number (with blocks in reverse postorder this would always
+          // be true, but we have no such guarantee).
+          assert(Pos.isBlock());
+          const MachineBasicBlock *MBB = Indexes.getMBBFromIndex(Pos);
+          // See if any of the predecessor blocks has a lower number and a VNI
+          for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+            SlotIndex PredEnd = Indexes.getMBBEndIdx(Pred);
+            VNI = getVNInfoBefore(PredEnd);
+            if (VNI != nullptr)
+              break;
+          }
+          // Def will come later: We have to do an extra fixup pass.
+          if (VNI == nullptr)
+            NeedVNIFixup = true;
+        }
+
+        CurrentSegment.start = Pos;
+        CurrentSegment.valno = VNI;
+        ConstructingSegment = true;
+      }
+      ActiveMask |= EventMask;
+    } else if (Event == END_SEGMENT) {
+      assert(ConstructingSegment);
+      // Finish segment if no lane is active anymore.
+      ActiveMask &= ~EventMask;
+      if (ActiveMask == 0) {
+        CurrentSegment.end = Pos;
+        append(CurrentSegment);
+        ConstructingSegment = false;
+      }
+    } else {
+      // We reached the end of the last subranges and can stop.
+      assert(Event == NOTHING);
+      break;
+    }
+  }
+
+  // We might not be able to assign new valnos for all segments if the basic
+  // block containing the definition comes after a segment using the valno.
+  // Do a fixup pass for this uncommon case.
+  if (NeedVNIFixup)
+    determineMissingVNIs(Indexes, *this);
+
+  assert(ActiveMask == 0 && !ConstructingSegment && "all segments ended");
+  verify();
+}
+
 unsigned LiveInterval::getSize() const {
   unsigned Sum = 0;
-  for (const_iterator I = begin(), E = end(); I != E; ++I)
-    Sum += I->start.distance(I->end);
+  for (const Segment &S : segments)
+    Sum += S.start.distance(S.end);
   return Sum;
 }
 
@@ -591,9 +1010,9 @@
   if (empty())
     OS << "EMPTY";
   else {
-    for (const_iterator I = begin(), E = end(); I != E; ++I) {
-      OS << *I;
-      assert(I->valno == getValNumInfo(I->valno->id) && "Bad VNInfo");
+    for (const Segment &S : segments) {
+      OS << S;
+      assert(S.valno == getValNumInfo(S.valno->id) && "Bad VNInfo");
     }
   }
 
@@ -620,6 +1039,10 @@
 void LiveInterval::print(raw_ostream &OS) const {
   OS << PrintReg(reg) << ' ';
   super::print(OS);
+  // Print subranges
+  for (const SubRange &SR : subranges()) {
+    OS << format(" L%04X ", SR.LaneMask) << SR;
+  }
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -648,6 +1071,26 @@
     }
   }
 }
+
+void LiveInterval::verify(const MachineRegisterInfo *MRI) const {
+  super::verify();
+
+  // Make sure SubRanges are fine and LaneMasks are disjoint.
+  unsigned Mask = 0;
+  unsigned MaxMask = MRI != nullptr ? MRI->getMaxLaneMaskForVReg(reg) : ~0u;
+  for (const SubRange &SR : subranges()) {
+    // The subrange lane mask should be disjoint from any previous subrange
+    // masks.
+    assert((Mask & SR.LaneMask) == 0);
+    Mask |= SR.LaneMask;
+
+    // The subrange mask should be contained in the maximum lane mask for the
+    // vreg.
+    assert((Mask & ~MaxMask) == 0);
+
+    SR.verify();
+    // The main live range should cover the subrange.
+    assert(covers(SR));
+  }
+}
 #endif
 
 
@@ -692,14 +1135,14 @@
   OS << " updater with gap = " << (ReadI - WriteI)
      << ", last start = " << LastStart
      << ":\n  Area 1:";
-  for (LiveRange::const_iterator I = LR->begin(); I != WriteI; ++I)
-    OS << ' ' << *I;
+  for (const auto &S : make_range(LR->begin(), WriteI))
+    OS << ' ' << S;
   OS << "\n  Spills:";
   for (unsigned I = 0, E = Spills.size(); I != E; ++I)
     OS << ' ' << Spills[I];
   OS << "\n  Area 2:";
-  for (LiveRange::const_iterator I = ReadI, E = LR->end(); I != E; ++I)
-    OS << ' ' << *I;
+  for (const auto &S : make_range(ReadI, LR->end()))
+    OS << ' ' << S;
   OS << '\n';
 }
 
@@ -723,6 +1166,13 @@
 void LiveRangeUpdater::add(LiveRange::Segment Seg) {
   assert(LR && "Cannot add to a null destination");
 
+  // Fall back to the regular add method if the live range
+  // is using the segment set instead of the segment vector.
+  if (LR->segmentSet != nullptr) {
+    LR->addSegmentToSet(Seg);
+    return;
+  }
+
   // Flush the state if Start moves backwards.
   if (!LastStart.isValid() || LastStart > Seg.start) {
     if (isDirty())
@@ -860,9 +1310,7 @@
   const VNInfo *used = nullptr, *unused = nullptr;
 
   // Determine connections.
-  for (LiveInterval::const_vni_iterator I = LI->vni_begin(), E = LI->vni_end();
-       I != E; ++I) {
-    const VNInfo *VNI = *I;
+  for (const VNInfo *VNI : LI->valnos) {
     // Group all unused values into one class.
     if (VNI->isUnused()) {
       if (unused)
@@ -938,6 +1386,8 @@
     } else
       *J++ = *I;
   }
+  // TODO: do not cheat anymore by simply clearing all subranges.
+  LI.clearSubRanges();
   LI.segments.erase(J, E);
 
   // Transfer VNInfos to their new owners and renumber them.
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index 1742e63..cc08045 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -32,6 +32,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
@@ -62,6 +63,17 @@
 static bool EnablePrecomputePhysRegs = false;
 #endif // NDEBUG
 
+static cl::opt<bool> EnableSubRegLiveness(
+  "enable-subreg-liveness", cl::Hidden, cl::init(true),
+  cl::desc("Enable subregister liveness tracking."));
+
+namespace llvm {
+cl::opt<bool> UseSegmentSetForPhysRegs(
+    "use-segment-set-for-physregs", cl::Hidden, cl::init(true),
+    cl::desc(
+        "Use segment set for the computation of the live ranges of physregs."));
+}
+
 void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
   AU.addRequired<AliasAnalysis>();
@@ -115,6 +127,10 @@
   AA = &getAnalysis<AliasAnalysis>();
   Indexes = &getAnalysis<SlotIndexes>();
   DomTree = &getAnalysis<MachineDominatorTree>();
+
+  if (EnableSubRegLiveness && MF->getSubtarget().enableSubRegLiveness())
+    MRI->enableSubRegLiveness(true);
+
   if (!LRCalc)
     LRCalc = new LiveRangeCalc();
 
@@ -183,9 +199,8 @@
   assert(LRCalc && "LRCalc not initialized.");
   assert(LI.empty() && "Should only compute empty intervals.");
   LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
-  LRCalc->createDeadDefs(LI);
-  LRCalc->extendToUses(LI);
-  computeDeadValues(&LI, LI, nullptr, nullptr);
+  LRCalc->calculate(LI);
+  computeDeadValues(LI, nullptr);
 }
 
 void LiveIntervals::computeVirtRegs() {
@@ -260,6 +275,10 @@
         LRCalc->extendToUses(LR, Reg);
     }
   }
+
+  // Flush the segment set to the segment vector.
+  if (UseSegmentSetForPhysRegs)
+    LR.flushSegmentSet();
 }
 
 
@@ -292,7 +311,8 @@
         unsigned Unit = *Units;
         LiveRange *LR = RegUnitRanges[Unit];
         if (!LR) {
-          LR = RegUnitRanges[Unit] = new LiveRange();
+          // Use the segment set to speed up the initial computation of the
+          // live range.
+          LR = RegUnitRanges[Unit] = new LiveRange(UseSegmentSetForPhysRegs);
           NewRanges.push_back(Unit);
         }
         VNInfo *VNI = LR->createDeadDef(Begin, getVNInfoAllocator());
@@ -312,6 +332,70 @@
 }
 
 
+static void createSegmentsForValues(LiveRange &LR,
+      iterator_range<LiveInterval::vni_iterator> VNIs) {
+  for (auto VNI : VNIs) {
+    if (VNI->isUnused())
+      continue;
+    SlotIndex Def = VNI->def;
+    LR.addSegment(LiveRange::Segment(Def, Def.getDeadSlot(), VNI));
+  }
+}
+
+typedef SmallVector<std::pair<SlotIndex, VNInfo*>, 16> ShrinkToUsesWorkList;
+
+static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes,
+                                 ShrinkToUsesWorkList &WorkList,
+                                 const LiveRange &OldRange) {
+  // Keep track of the PHIs that are in use.
+  SmallPtrSet<VNInfo*, 8> UsedPHIs;
+  // Blocks that have already been added to WorkList as live-out.
+  SmallPtrSet<MachineBasicBlock*, 16> LiveOut;
+
+  // Extend intervals to reach all uses in WorkList.
+  while (!WorkList.empty()) {
+    SlotIndex Idx = WorkList.back().first;
+    VNInfo *VNI = WorkList.back().second;
+    WorkList.pop_back();
+    const MachineBasicBlock *MBB = Indexes.getMBBFromIndex(Idx.getPrevSlot());
+    SlotIndex BlockStart = Indexes.getMBBStartIdx(MBB);
+
+    // Extend the live range for VNI to be live at Idx.
+    if (VNInfo *ExtVNI = LR.extendInBlock(BlockStart, Idx)) {
+      assert(ExtVNI == VNI && "Unexpected existing value number");
+      (void)ExtVNI;
+      // Is this a PHIDef we haven't seen before?
+      if (!VNI->isPHIDef() || VNI->def != BlockStart ||
+          !UsedPHIs.insert(VNI).second)
+        continue;
+      // The PHI is live, make sure the predecessors are live-out.
+      for (auto &Pred : MBB->predecessors()) {
+        if (!LiveOut.insert(Pred).second)
+          continue;
+        SlotIndex Stop = Indexes.getMBBEndIdx(Pred);
+        // A predecessor is not required to have a live-out value for a PHI.
+        if (VNInfo *PVNI = OldRange.getVNInfoBefore(Stop))
+          WorkList.push_back(std::make_pair(Stop, PVNI));
+      }
+      continue;
+    }
+
+    // VNI is live-in to MBB.
+    DEBUG(dbgs() << " live-in at " << BlockStart << '\n');
+    LR.addSegment(LiveRange::Segment(BlockStart, Idx, VNI));
+
+    // Make sure VNI is live-out from the predecessors.
+    for (auto &Pred : MBB->predecessors()) {
+      if (!LiveOut.insert(Pred).second)
+        continue;
+      SlotIndex Stop = Indexes.getMBBEndIdx(Pred);
+      assert(OldRange.getVNInfoBefore(Stop) == VNI &&
+             "Wrong value out of predecessor");
+      WorkList.push_back(std::make_pair(Stop, VNI));
+    }
+  }
+}
+
 /// shrinkToUses - After removing some uses of a register, shrink its live
 /// range to just the remaining uses. This method does not compute reaching
 /// defs for new uses, and it doesn't remove dead defs.
@@ -320,11 +404,14 @@
   DEBUG(dbgs() << "Shrink: " << *li << '\n');
   assert(TargetRegisterInfo::isVirtualRegister(li->reg)
          && "Can only shrink virtual registers");
-  // Find all the values used, including PHI kills.
-  SmallVector<std::pair<SlotIndex, VNInfo*>, 16> WorkList;
 
-  // Blocks that have already been added to WorkList as live-out.
-  SmallPtrSet<MachineBasicBlock*, 16> LiveOut;
+  // Shrink subregister live ranges.
+  for (LiveInterval::SubRange &S : li->subranges()) {
+    shrinkToUses(S, li->reg);
+  }
+
+  // Find all the values used, including PHI kills.
+  ShrinkToUsesWorkList WorkList;
 
   // Visit all instructions reading li->reg.
   for (MachineRegisterInfo::reg_instr_iterator
@@ -355,103 +442,126 @@
 
   // Create new live ranges with only minimal live segments per def.
   LiveRange NewLR;
-  for (LiveInterval::vni_iterator I = li->vni_begin(), E = li->vni_end();
-       I != E; ++I) {
-    VNInfo *VNI = *I;
-    if (VNI->isUnused())
-      continue;
-    NewLR.addSegment(LiveRange::Segment(VNI->def, VNI->def.getDeadSlot(), VNI));
-  }
-
-  // Keep track of the PHIs that are in use.
-  SmallPtrSet<VNInfo*, 8> UsedPHIs;
-
-  // Extend intervals to reach all uses in WorkList.
-  while (!WorkList.empty()) {
-    SlotIndex Idx = WorkList.back().first;
-    VNInfo *VNI = WorkList.back().second;
-    WorkList.pop_back();
-    const MachineBasicBlock *MBB = getMBBFromIndex(Idx.getPrevSlot());
-    SlotIndex BlockStart = getMBBStartIdx(MBB);
-
-    // Extend the live range for VNI to be live at Idx.
-    if (VNInfo *ExtVNI = NewLR.extendInBlock(BlockStart, Idx)) {
-      (void)ExtVNI;
-      assert(ExtVNI == VNI && "Unexpected existing value number");
-      // Is this a PHIDef we haven't seen before?
-      if (!VNI->isPHIDef() || VNI->def != BlockStart ||
-          !UsedPHIs.insert(VNI).second)
-        continue;
-      // The PHI is live, make sure the predecessors are live-out.
-      for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
-           PE = MBB->pred_end(); PI != PE; ++PI) {
-        if (!LiveOut.insert(*PI).second)
-          continue;
-        SlotIndex Stop = getMBBEndIdx(*PI);
-        // A predecessor is not required to have a live-out value for a PHI.
-        if (VNInfo *PVNI = li->getVNInfoBefore(Stop))
-          WorkList.push_back(std::make_pair(Stop, PVNI));
-      }
-      continue;
-    }
-
-    // VNI is live-in to MBB.
-    DEBUG(dbgs() << " live-in at " << BlockStart << '\n');
-    NewLR.addSegment(LiveRange::Segment(BlockStart, Idx, VNI));
-
-    // Make sure VNI is live-out from the predecessors.
-    for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
-         PE = MBB->pred_end(); PI != PE; ++PI) {
-      if (!LiveOut.insert(*PI).second)
-        continue;
-      SlotIndex Stop = getMBBEndIdx(*PI);
-      assert(li->getVNInfoBefore(Stop) == VNI &&
-             "Wrong value out of predecessor");
-      WorkList.push_back(std::make_pair(Stop, VNI));
-    }
-  }
-
-  // Handle dead values.
-  bool CanSeparate = false;
-  computeDeadValues(li, NewLR, &CanSeparate, dead);
+  createSegmentsForValues(NewLR, make_range(li->vni_begin(), li->vni_end()));
+  extendSegmentsToUses(NewLR, *Indexes, WorkList, *li);
 
   // Move the trimmed segments back.
   li->segments.swap(NewLR.segments);
+
+  // Handle dead values.
+  bool CanSeparate = computeDeadValues(*li, dead);
   DEBUG(dbgs() << "Shrunk: " << *li << '\n');
   return CanSeparate;
 }
 
-void LiveIntervals::computeDeadValues(LiveInterval *li,
-                                      LiveRange &LR,
-                                      bool *CanSeparate,
+bool LiveIntervals::computeDeadValues(LiveInterval &LI,
                                       SmallVectorImpl<MachineInstr*> *dead) {
-  for (LiveInterval::vni_iterator I = li->vni_begin(), E = li->vni_end();
-       I != E; ++I) {
-    VNInfo *VNI = *I;
+  bool PHIRemoved = false;
+  for (auto VNI : LI.valnos) {
     if (VNI->isUnused())
       continue;
-    LiveRange::iterator LRI = LR.FindSegmentContaining(VNI->def);
-    assert(LRI != LR.end() && "Missing segment for PHI");
-    if (LRI->end != VNI->def.getDeadSlot())
+    SlotIndex Def = VNI->def;
+    LiveRange::iterator I = LI.FindSegmentContaining(Def);
+    assert(I != LI.end() && "Missing segment for VNI");
+
+    // Is the register live before this def? Otherwise we may have to add a
+    // read-undef flag for subregister defs.
+    if (MRI->tracksSubRegLiveness()) {
+      if ((I == LI.begin() || std::prev(I)->end < Def) && !VNI->isPHIDef()) {
+        MachineInstr *MI = getInstructionFromIndex(Def);
+        MI->addRegisterDefReadUndef(LI.reg);
+      }
+    }
+
+    if (I->end != Def.getDeadSlot())
       continue;
     if (VNI->isPHIDef()) {
       // This is a dead PHI. Remove it.
       VNI->markUnused();
-      LR.removeSegment(LRI->start, LRI->end);
-      DEBUG(dbgs() << "Dead PHI at " << VNI->def << " may separate interval\n");
-      if (CanSeparate)
-        *CanSeparate = true;
+      LI.removeSegment(I);
+      DEBUG(dbgs() << "Dead PHI at " << Def << " may separate interval\n");
+      PHIRemoved = true;
     } else {
       // This is a dead def. Make sure the instruction knows.
-      MachineInstr *MI = getInstructionFromIndex(VNI->def);
+      MachineInstr *MI = getInstructionFromIndex(Def);
       assert(MI && "No instruction defining live value");
-      MI->addRegisterDead(li->reg, TRI);
+      MI->addRegisterDead(LI.reg, TRI);
       if (dead && MI->allDefsAreDead()) {
-        DEBUG(dbgs() << "All defs dead: " << VNI->def << '\t' << *MI);
+        DEBUG(dbgs() << "All defs dead: " << Def << '\t' << *MI);
         dead->push_back(MI);
       }
     }
   }
+  return PHIRemoved;
+}
+
+void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg) {
+  DEBUG(dbgs() << "Shrink: " << SR << '\n');
+  assert(TargetRegisterInfo::isVirtualRegister(Reg)
+         && "Can only shrink virtual registers");
+  // Find all the values used, including PHI kills.
+  ShrinkToUsesWorkList WorkList;
+
+  // Visit all instructions reading Reg.
+  SlotIndex LastIdx;
+  for (MachineOperand &MO : MRI->reg_operands(Reg)) {
+    MachineInstr *UseMI = MO.getParent();
+    if (UseMI->isDebugValue())
+      continue;
+    // Maybe the operand is for a subregister we don't care about.
+    unsigned SubReg = MO.getSubReg();
+    if (SubReg != 0) {
+      unsigned SubRegMask = TRI->getSubRegIndexLaneMask(SubReg);
+      if ((SubRegMask & SR.LaneMask) == 0)
+        continue;
+    }
+    // We only need to visit each instruction once.
+    SlotIndex Idx = getInstructionIndex(UseMI).getRegSlot();
+    if (Idx == LastIdx)
+      continue;
+    LastIdx = Idx;
+
+    LiveQueryResult LRQ = SR.Query(Idx);
+    VNInfo *VNI = LRQ.valueIn();
+    // For subranges it is possible that only undef values are left in that
+    // part of the subregister, so there is no real live range at the use.
+    if (!VNI)
+      continue;
+
+    // Special case: An early-clobber tied operand reads and writes the
+    // register one slot early.
+    if (VNInfo *DefVNI = LRQ.valueDefined())
+      Idx = DefVNI->def;
+
+    WorkList.push_back(std::make_pair(Idx, VNI));
+  }
+
+  // Create a new live range with only minimal live segments per def.
+  LiveRange NewLR;
+  createSegmentsForValues(NewLR, make_range(SR.vni_begin(), SR.vni_end()));
+  extendSegmentsToUses(NewLR, *Indexes, WorkList, SR);
+
+  // Move the trimmed ranges back.
+  SR.segments.swap(NewLR.segments);
+
+  // Remove dead PHI value numbers
+  for (auto VNI : SR.valnos) {
+    if (VNI->isUnused())
+      continue;
+    const LiveRange::Segment *Segment = SR.getSegmentContaining(VNI->def);
+    assert(Segment != nullptr && "Missing segment for VNI");
+    if (Segment->end != VNI->def.getDeadSlot())
+      continue;
+    if (VNI->isPHIDef()) {
+      // This is a dead PHI. Remove it.
+      VNI->markUnused();
+      SR.removeSegment(*Segment);
+      DEBUG(dbgs() << "Dead PHI at " << VNI->def << " may separate interval\n");
+    }
+  }
+
+  DEBUG(dbgs() << "Shrunk: " << SR << '\n');
 }
 
 void LiveIntervals::extendToIndices(LiveRange &LR,
@@ -462,26 +572,25 @@
     LRCalc->extend(LR, Indices[i]);
 }
 
-void LiveIntervals::pruneValue(LiveInterval *LI, SlotIndex Kill,
+void LiveIntervals::pruneValue(LiveRange &LR, SlotIndex Kill,
                                SmallVectorImpl<SlotIndex> *EndPoints) {
-  LiveQueryResult LRQ = LI->Query(Kill);
-  VNInfo *VNI = LRQ.valueOut();
+  LiveQueryResult LRQ = LR.Query(Kill);
+  VNInfo *VNI = LRQ.valueOutOrDead();
   if (!VNI)
     return;
 
   MachineBasicBlock *KillMBB = Indexes->getMBBFromIndex(Kill);
-  SlotIndex MBBStart, MBBEnd;
-  std::tie(MBBStart, MBBEnd) = Indexes->getMBBRange(KillMBB);
+  SlotIndex MBBEnd = Indexes->getMBBEndIdx(KillMBB);
 
   // If VNI isn't live out from KillMBB, the value is trivially pruned.
   if (LRQ.endPoint() < MBBEnd) {
-    LI->removeSegment(Kill, LRQ.endPoint());
+    LR.removeSegment(Kill, LRQ.endPoint());
     if (EndPoints) EndPoints->push_back(LRQ.endPoint());
     return;
   }
 
   // VNI is live out of KillMBB.
-  LI->removeSegment(Kill, MBBEnd);
+  LR.removeSegment(Kill, MBBEnd);
   if (EndPoints) EndPoints->push_back(MBBEnd);
 
   // Find all blocks that are reachable from KillMBB without leaving VNI's live
@@ -498,8 +607,9 @@
       MachineBasicBlock *MBB = *I;
 
       // Check if VNI is live in to MBB.
+      SlotIndex MBBStart, MBBEnd;
       std::tie(MBBStart, MBBEnd) = Indexes->getMBBRange(MBB);
-      LiveQueryResult LRQ = LI->Query(MBBStart);
+      LiveQueryResult LRQ = LR.Query(MBBStart);
       if (LRQ.valueIn() != VNI) {
         // This block isn't part of the VNI segment. Prune the search.
         I.skipChildren();
@@ -508,14 +618,14 @@
 
       // Prune the search if VNI is killed in MBB.
       if (LRQ.endPoint() < MBBEnd) {
-        LI->removeSegment(MBBStart, LRQ.endPoint());
+        LR.removeSegment(MBBStart, LRQ.endPoint());
         if (EndPoints) EndPoints->push_back(LRQ.endPoint());
         I.skipChildren();
         continue;
       }
 
       // VNI is live through MBB.
-      LI->removeSegment(MBBStart, MBBEnd);
+      LR.removeSegment(MBBStart, MBBEnd);
       if (EndPoints) EndPoints->push_back(MBBEnd);
       ++I;
     }
@@ -528,14 +638,17 @@
 
 void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
   // Keep track of regunit ranges.
-  SmallVector<std::pair<LiveRange*, LiveRange::iterator>, 8> RU;
+  SmallVector<std::pair<const LiveRange*, LiveRange::const_iterator>, 8> RU;
+  // Keep track of subregister ranges.
+  SmallVector<std::pair<const LiveInterval::SubRange*,
+                        LiveRange::const_iterator>, 4> SRs;
 
   for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
     unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
     if (MRI->reg_nodbg_empty(Reg))
       continue;
-    LiveInterval *LI = &getInterval(Reg);
-    if (LI->empty())
+    const LiveInterval &LI = getInterval(Reg);
+    if (LI.empty())
       continue;
 
     // Find the regunit intervals for the assigned register. They may overlap
@@ -543,15 +656,22 @@
     RU.clear();
     for (MCRegUnitIterator Units(VRM->getPhys(Reg), TRI); Units.isValid();
          ++Units) {
-      LiveRange &RURanges = getRegUnit(*Units);
-      if (RURanges.empty())
+      const LiveRange &RURange = getRegUnit(*Units);
+      if (RURange.empty())
         continue;
-      RU.push_back(std::make_pair(&RURanges, RURanges.find(LI->begin()->end)));
+      RU.push_back(std::make_pair(&RURange, RURange.find(LI.begin()->end)));
+    }
+
+    if (MRI->tracksSubRegLiveness()) {
+      SRs.clear();
+      for (const LiveInterval::SubRange &SR : LI.subranges()) {
+        SRs.push_back(std::make_pair(&SR, SR.find(LI.begin()->end)));
+      }
     }
 
     // Every instruction that kills Reg corresponds to a segment range end
     // point.
-    for (LiveInterval::iterator RI = LI->begin(), RE = LI->end(); RI != RE;
+    for (LiveInterval::const_iterator RI = LI.begin(), RE = LI.end(); RI != RE;
          ++RI) {
       // A block index indicates an MBB edge.
       if (RI->end.isBlock())
@@ -568,23 +688,80 @@
       //   BAR %EAX<kill>
       //
       // There should be no kill flag on FOO when %vreg5 is rewritten as %EAX.
-      bool CancelKill = false;
-      for (unsigned u = 0, e = RU.size(); u != e; ++u) {
-        LiveRange &RRanges = *RU[u].first;
-        LiveRange::iterator &I = RU[u].second;
-        if (I == RRanges.end())
+      for (auto &RUP : RU) {
+        const LiveRange &RURange = *RUP.first;
+        LiveRange::const_iterator &I = RUP.second;
+        if (I == RURange.end())
           continue;
-        I = RRanges.advanceTo(I, RI->end);
-        if (I == RRanges.end() || I->start >= RI->end)
+        I = RURange.advanceTo(I, RI->end);
+        if (I == RURange.end() || I->start >= RI->end)
           continue;
         // I is overlapping RI.
-        CancelKill = true;
-        break;
+        goto CancelKill;
       }
-      if (CancelKill)
-        MI->clearRegisterKills(Reg, nullptr);
-      else
-        MI->addRegisterKilled(Reg, nullptr);
+
+      if (MRI->tracksSubRegLiveness()) {
+        // When reading a partially undefined value we must not add a kill
+        // flag.
+        // The regalloc might have used the undef lane for something else.
+        // Example:
+        //     %vreg1 = ...              ; R32: %vreg1
+        //     %vreg2:high16 = ...       ; R64: %vreg2
+        //        = read %vreg2<kill>    ; R64: %vreg2
+        //        = read %vreg1          ; R32: %vreg1
+        // The <kill> flag is correct for %vreg2, but the register allocator
+        // may assign R0L to %vreg1, and R0 to %vreg2 because the low 32 bits
+        // of R0 are actually never written by %vreg2. After assignment, the
+        // <kill> flag at the read instruction is invalid.
+        unsigned DefinedLanesMask;
+        if (!SRs.empty()) {
+          // Compute a mask of lanes that are defined.
+          DefinedLanesMask = 0;
+          for (auto &SRP : SRs) {
+            const LiveInterval::SubRange &SR = *SRP.first;
+            LiveRange::const_iterator &I = SRP.second;
+            if (I == SR.end())
+              continue;
+            I = SR.advanceTo(I, RI->end);
+            if (I == SR.end() || I->start >= RI->end)
+              continue;
+            // I is overlapping RI
+            DefinedLanesMask |= SR.LaneMask;
+          }
+        } else
+          DefinedLanesMask = ~0u;
+
+        bool IsFullWrite = false;
+        for (const MachineOperand &MO : MI->operands()) {
+          if (!MO.isReg() || MO.getReg() != Reg)
+            continue;
+          if (MO.isUse()) {
+            // Reading any undefined lanes?
+            unsigned UseMask = TRI->getSubRegIndexLaneMask(MO.getSubReg());
+            if ((UseMask & ~DefinedLanesMask) != 0)
+              goto CancelKill;
+          } else if (MO.getSubReg() == 0) {
+            // Writing to the full register?
+            assert(MO.isDef());
+            IsFullWrite = true;
+          }
+        }
+
+        // If an instruction writes to a subregister, a new segment starts in
+        // the LiveInterval. But as this only overwrites part of the register,
+        // adding kill flags is not correct here after registers have been
+        // assigned.
+        if (!IsFullWrite) {
+          // Next segment has to be adjacent in the subregister write case.
+          LiveRange::const_iterator N = std::next(RI);
+          if (N != LI.end() && N->start == RI->end)
+            goto CancelKill;
+        }
+      }
+
+      MI->addRegisterKilled(Reg, nullptr);
+      continue;
+CancelKill:
+      MI->clearRegisterKills(Reg, nullptr);
     }
   }
 }
@@ -615,9 +792,7 @@
 
 bool
 LiveIntervals::hasPHIKill(const LiveInterval &LI, const VNInfo *VNI) const {
-  for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end();
-       I != E; ++I) {
-    const VNInfo *PHI = *I;
+  for (const VNInfo *PHI : LI.valnos) {
     if (PHI->isUnused() || !PHI->isPHIDef())
       continue;
     const MachineBasicBlock *PHIMBB = getMBBFromIndex(PHI->def);
@@ -767,7 +942,16 @@
         continue;
       if (TargetRegisterInfo::isVirtualRegister(Reg)) {
         LiveInterval &LI = LIS.getInterval(Reg);
-        updateRange(LI, Reg);
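+        // Update any subranges whose lanes are touched by this operand before
+        // updating the main range.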
+        if (LI.hasSubRanges()) {
+          unsigned SubReg = MO->getSubReg();
+          unsigned LaneMask = TRI.getSubRegIndexLaneMask(SubReg);
+          for (LiveInterval::SubRange &S : LI.subranges()) {
+            if ((S.LaneMask & LaneMask) == 0)
+              continue;
+            updateRange(S, Reg, S.LaneMask);
+          }
+        }
+        updateRange(LI, Reg, 0);
         continue;
       }
 
@@ -775,7 +959,7 @@
       // precomputed live range.
       for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units)
         if (LiveRange *LR = getRegUnitLI(*Units))
-          updateRange(*LR, *Units);
+          updateRange(*LR, *Units, 0);
     }
     if (hasRegMask)
       updateRegMaskSlots();
@@ -784,21 +968,24 @@
 private:
   /// Update a single live range, assuming an instruction has been moved from
   /// OldIdx to NewIdx.
-  void updateRange(LiveRange &LR, unsigned Reg) {
+  void updateRange(LiveRange &LR, unsigned Reg, unsigned LaneMask) {
     if (!Updated.insert(&LR).second)
       return;
     DEBUG({
       dbgs() << "     ";
-      if (TargetRegisterInfo::isVirtualRegister(Reg))
+      if (TargetRegisterInfo::isVirtualRegister(Reg)) {
         dbgs() << PrintReg(Reg);
-      else
+        if (LaneMask != 0)
+          dbgs() << format(" L%04X", LaneMask);
+      } else {
         dbgs() << PrintRegUnit(Reg, &TRI);
+      }
       dbgs() << ":\t" << LR << '\n';
     });
     if (SlotIndex::isEarlierInstr(OldIdx, NewIdx))
       handleMoveDown(LR);
     else
-      handleMoveUp(LR, Reg);
+      handleMoveUp(LR, Reg, LaneMask);
     DEBUG(dbgs() << "        -->\t" << LR << '\n');
     LR.verify();
   }
@@ -911,7 +1098,7 @@
   ///    Hoist kill to NewIdx, then scan for last kill between NewIdx and
   ///    OldIdx.
   ///
-  void handleMoveUp(LiveRange &LR, unsigned Reg) {
+  void handleMoveUp(LiveRange &LR, unsigned Reg, unsigned LaneMask) {
     // First look for a kill at OldIdx.
     LiveRange::iterator I = LR.find(OldIdx.getBaseIndex());
     LiveRange::iterator E = LR.end();
@@ -932,7 +1119,7 @@
       if (I == E || !SlotIndex::isSameInstr(I->start, OldIdx)) {
         // No def, search for the new kill.
         // This can never be an early clobber kill since there is no def.
-        std::prev(I)->end = findLastUseBefore(Reg).getRegSlot();
+        std::prev(I)->end = findLastUseBefore(Reg, LaneMask).getRegSlot();
         return;
       }
     }
@@ -988,15 +1175,17 @@
   }
 
   // Return the last use of reg between NewIdx and OldIdx.
-  SlotIndex findLastUseBefore(unsigned Reg) {
+  SlotIndex findLastUseBefore(unsigned Reg, unsigned LaneMask) {
 
     if (TargetRegisterInfo::isVirtualRegister(Reg)) {
       SlotIndex LastUse = NewIdx;
-      for (MachineRegisterInfo::use_instr_nodbg_iterator
-             UI = MRI.use_instr_nodbg_begin(Reg),
-             UE = MRI.use_instr_nodbg_end();
-           UI != UE; ++UI) {
-        const MachineInstr* MI = &*UI;
+      for (MachineOperand &MO : MRI.use_nodbg_operands(Reg)) {
+        unsigned SubReg = MO.getSubReg();
+        if (SubReg != 0 && LaneMask != 0
+            && (TRI.getSubRegIndexLaneMask(SubReg) & LaneMask) == 0)
+          continue;
+
+        const MachineInstr *MI = MO.getParent();
         SlotIndex InstSlot = LIS.getSlotIndexes()->getInstructionIndex(MI);
         if (InstSlot > LastUse && InstSlot < OldIdx)
           LastUse = InstSlot;
@@ -1062,6 +1251,94 @@
   HME.updateAllRanges(MI);
 }
 
+void LiveIntervals::repairOldRegInRange(const MachineBasicBlock::iterator Begin,
+                                        const MachineBasicBlock::iterator End,
+                                        const SlotIndex endIdx,
+                                        LiveRange &LR, const unsigned Reg,
+                                        const unsigned LaneMask) {
+  LiveInterval::iterator LII = LR.find(endIdx);
+  SlotIndex lastUseIdx;
+  if (LII != LR.end() && LII->start < endIdx)
+    lastUseIdx = LII->end;
+  else
+    --LII;
+
+  for (MachineBasicBlock::iterator I = End; I != Begin;) {
+    --I;
+    MachineInstr *MI = I;
+    if (MI->isDebugValue())
+      continue;
+
+    SlotIndex instrIdx = getInstructionIndex(MI);
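+    // Check whether the current segment's boundaries still correspond to
+    // existing instructions; they may not if those instructions were removed.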
+    bool isStartValid = getInstructionFromIndex(LII->start);
+    bool isEndValid = getInstructionFromIndex(LII->end);
+
+    // FIXME: This doesn't currently handle early-clobber or multiple removed
+    // defs inside of the region to repair.
+    for (MachineInstr::mop_iterator OI = MI->operands_begin(),
+         OE = MI->operands_end(); OI != OE; ++OI) {
+      const MachineOperand &MO = *OI;
+      if (!MO.isReg() || MO.getReg() != Reg)
+        continue;
+
+      unsigned SubReg = MO.getSubReg();
+      unsigned Mask = TRI->getSubRegIndexLaneMask(SubReg);
+      if ((Mask & LaneMask) == 0)
+        continue;
+
+      if (MO.isDef()) {
+        if (!isStartValid) {
+          if (LII->end.isDead()) {
+            SlotIndex prevStart;
+            if (LII != LR.begin())
+              prevStart = std::prev(LII)->start;
+
+            // FIXME: This could be more efficient if there was a
+            // removeSegment method that returned an iterator.
+            LR.removeSegment(*LII, true);
+            if (prevStart.isValid())
+              LII = LR.find(prevStart);
+            else
+              LII = LR.begin();
+          } else {
+            LII->start = instrIdx.getRegSlot();
+            LII->valno->def = instrIdx.getRegSlot();
+            if (MO.getSubReg() && !MO.isUndef())
+              lastUseIdx = instrIdx.getRegSlot();
+            else
+              lastUseIdx = SlotIndex();
+            continue;
+          }
+        }
+
+        if (!lastUseIdx.isValid()) {
+          VNInfo *VNI = LR.getNextValue(instrIdx.getRegSlot(), VNInfoAllocator);
+          LiveRange::Segment S(instrIdx.getRegSlot(),
+                               instrIdx.getDeadSlot(), VNI);
+          LII = LR.addSegment(S);
+        } else if (LII->start != instrIdx.getRegSlot()) {
+          VNInfo *VNI = LR.getNextValue(instrIdx.getRegSlot(), VNInfoAllocator);
+          LiveRange::Segment S(instrIdx.getRegSlot(), lastUseIdx, VNI);
+          LII = LR.addSegment(S);
+        }
+
+        if (MO.getSubReg() && !MO.isUndef())
+          lastUseIdx = instrIdx.getRegSlot();
+        else
+          lastUseIdx = SlotIndex();
+      } else if (MO.isUse()) {
+        // FIXME: This should probably be handled outside of this branch,
+        // either as part of the def case (for defs inside of the region) or
+        // after the loop over the region.
+        if (!isEndValid && !LII->end.isBlock())
+          LII->end = instrIdx.getRegSlot();
+        if (!lastUseIdx.isValid())
+          lastUseIdx = instrIdx.getRegSlot();
+      }
+    }
+  }
+}
+
 void
 LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
                                       MachineBasicBlock::iterator Begin,
@@ -1107,83 +1384,31 @@
     if (!LI.hasAtLeastOneValue())
       continue;
 
-    LiveInterval::iterator LII = LI.find(endIdx);
-    SlotIndex lastUseIdx;
-    if (LII != LI.end() && LII->start < endIdx)
-      lastUseIdx = LII->end;
-    else
-      --LII;
-
-    for (MachineBasicBlock::iterator I = End; I != Begin;) {
-      --I;
-      MachineInstr *MI = I;
-      if (MI->isDebugValue())
-        continue;
-
-      SlotIndex instrIdx = getInstructionIndex(MI);
-      bool isStartValid = getInstructionFromIndex(LII->start);
-      bool isEndValid = getInstructionFromIndex(LII->end);
-
-      // FIXME: This doesn't currently handle early-clobber or multiple removed
-      // defs inside of the region to repair.
-      for (MachineInstr::mop_iterator OI = MI->operands_begin(),
-           OE = MI->operands_end(); OI != OE; ++OI) {
-        const MachineOperand &MO = *OI;
-        if (!MO.isReg() || MO.getReg() != Reg)
-          continue;
-
-        if (MO.isDef()) {
-          if (!isStartValid) {
-            if (LII->end.isDead()) {
-              SlotIndex prevStart;
-              if (LII != LI.begin())
-                prevStart = std::prev(LII)->start;
-
-              // FIXME: This could be more efficient if there was a
-              // removeSegment method that returned an iterator.
-              LI.removeSegment(*LII, true);
-              if (prevStart.isValid())
-                LII = LI.find(prevStart);
-              else
-                LII = LI.begin();
-            } else {
-              LII->start = instrIdx.getRegSlot();
-              LII->valno->def = instrIdx.getRegSlot();
-              if (MO.getSubReg() && !MO.isUndef())
-                lastUseIdx = instrIdx.getRegSlot();
-              else
-                lastUseIdx = SlotIndex();
-              continue;
-            }
-          }
-
-          if (!lastUseIdx.isValid()) {
-            VNInfo *VNI = LI.getNextValue(instrIdx.getRegSlot(),
-                                          VNInfoAllocator);
-            LiveRange::Segment S(instrIdx.getRegSlot(),
-                                 instrIdx.getDeadSlot(), VNI);
-            LII = LI.addSegment(S);
-          } else if (LII->start != instrIdx.getRegSlot()) {
-            VNInfo *VNI = LI.getNextValue(instrIdx.getRegSlot(),
-                                          VNInfoAllocator);
-            LiveRange::Segment S(instrIdx.getRegSlot(), lastUseIdx, VNI);
-            LII = LI.addSegment(S);
-          }
-
-          if (MO.getSubReg() && !MO.isUndef())
-            lastUseIdx = instrIdx.getRegSlot();
-          else
-            lastUseIdx = SlotIndex();
-        } else if (MO.isUse()) {
-          // FIXME: This should probably be handled outside of this branch,
-          // either as part of the def case (for defs inside of the region) or
-          // after the loop over the region.
-          if (!isEndValid && !LII->end.isBlock())
-            LII->end = instrIdx.getRegSlot();
-          if (!lastUseIdx.isValid())
-            lastUseIdx = instrIdx.getRegSlot();
-        }
-      }
+    for (LiveInterval::SubRange &S : LI.subranges()) {
+      repairOldRegInRange(Begin, End, endIdx, S, Reg, S.LaneMask);
     }
+    repairOldRegInRange(Begin, End, endIdx, LI, Reg);
   }
 }
+
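+/// Remove the value numbers defined at Pos from every cached regunit live
+/// range of physical register Reg.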
+void LiveIntervals::removePhysRegDefAt(unsigned Reg, SlotIndex Pos) {
+  for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
+    if (LiveRange *LR = getCachedRegUnit(*Units))
+      if (VNInfo *VNI = LR->getVNInfoAt(Pos))
+        LR->removeValNo(VNI);
+  }
+}
+
+void LiveIntervals::removeVRegDefAt(LiveInterval &LI, SlotIndex Pos) {
+  VNInfo *VNI = LI.getVNInfoAt(Pos);
+  if (VNI == nullptr)
+    return;
+  LI.removeValNo(VNI);
+
+  // Also remove the value in subranges.
+  for (LiveInterval::SubRange &S : LI.subranges()) {
+    if (VNInfo *SVNI = S.getVNInfoAt(Pos))
+      S.removeValNo(SVNI);
+  }
+  LI.removeEmptySubRanges();
+}
diff --git a/lib/CodeGen/LiveIntervalUnion.cpp b/lib/CodeGen/LiveIntervalUnion.cpp
index d81221b..025d99c 100644
--- a/lib/CodeGen/LiveIntervalUnion.cpp
+++ b/lib/CodeGen/LiveIntervalUnion.cpp
@@ -26,14 +26,14 @@
 
 
 // Merge a LiveInterval's segments. Guarantee no overlaps.
-void LiveIntervalUnion::unify(LiveInterval &VirtReg) {
-  if (VirtReg.empty())
+void LiveIntervalUnion::unify(LiveInterval &VirtReg, const LiveRange &Range) {
+  if (Range.empty())
     return;
   ++Tag;
 
   // Insert each of the virtual register's live segments into the map.
-  LiveInterval::iterator RegPos = VirtReg.begin();
-  LiveInterval::iterator RegEnd = VirtReg.end();
+  LiveRange::const_iterator RegPos = Range.begin();
+  LiveRange::const_iterator RegEnd = Range.end();
   SegmentIter SegPos = Segments.find(RegPos->start);
 
   while (SegPos.valid()) {
@@ -53,14 +53,14 @@
 }
 
 // Remove a live virtual register's segments from this union.
-void LiveIntervalUnion::extract(LiveInterval &VirtReg) {
-  if (VirtReg.empty())
+void LiveIntervalUnion::extract(LiveInterval &VirtReg, const LiveRange &Range) {
+  if (Range.empty())
     return;
   ++Tag;
 
   // Remove each of the virtual register's live segments from the map.
-  LiveInterval::iterator RegPos = VirtReg.begin();
-  LiveInterval::iterator RegEnd = VirtReg.end();
+  LiveRange::const_iterator RegPos = Range.begin();
+  LiveRange::const_iterator RegEnd = Range.end();
   SegmentIter SegPos = Segments.find(RegPos->start);
 
   for (;;) {
@@ -70,7 +70,7 @@
       return;
 
     // Skip all segments that may have been coalesced.
-    RegPos = VirtReg.advanceTo(RegPos, SegPos.start());
+    RegPos = Range.advanceTo(RegPos, SegPos.start());
     if (RegPos == RegEnd)
       return;
 
diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp
index a558e14..d804b39 100644
--- a/lib/CodeGen/LiveRangeCalc.cpp
+++ b/lib/CodeGen/LiveRangeCalc.cpp
@@ -19,6 +19,13 @@
 
 #define DEBUG_TYPE "regalloc"
 
+void LiveRangeCalc::resetLiveOutMap() {
+  unsigned NumBlocks = MF->getNumBlockIDs();
+  Seen.clear();
+  Seen.resize(NumBlocks);
+  Map.resize(NumBlocks);
+}
+
 void LiveRangeCalc::reset(const MachineFunction *mf,
                           SlotIndexes *SI,
                           MachineDominatorTree *MDT,
@@ -28,126 +35,207 @@
   Indexes = SI;
   DomTree = MDT;
   Alloc = VNIA;
-
-  unsigned N = MF->getNumBlockIDs();
-  Seen.clear();
-  Seen.resize(N);
-  LiveOut.resize(N);
+  resetLiveOutMap();
   LiveIn.clear();
 }
 
 
+static void createDeadDef(SlotIndexes &Indexes, VNInfo::Allocator &Alloc,
+                          LiveRange &LR, const MachineOperand &MO) {
+  const MachineInstr *MI = MO.getParent();
+  SlotIndex DefIdx =
+      Indexes.getInstructionIndex(MI).getRegSlot(MO.isEarlyClobber());
+
+  // Create the def in LR. This may find an existing def.
+  LR.createDeadDef(DefIdx, Alloc);
+}
+
+void LiveRangeCalc::calculate(LiveInterval &LI) {
+  assert(MRI && Indexes && "call reset() first");
+
+  // Step 1: Create minimal live segments for every definition of Reg.
+  // Visit all def operands. If the same instruction has multiple defs of Reg,
+  // createDeadDef() will deduplicate.
+  const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo();
+  unsigned Reg = LI.reg;
+  for (const MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
+    if (!MO.isDef() && !MO.readsReg())
+      continue;
+
+    unsigned SubReg = MO.getSubReg();
+    if (LI.hasSubRanges() || (SubReg != 0 && MRI->tracksSubRegLiveness())) {
+      unsigned Mask = SubReg != 0 ? TRI.getSubRegIndexLaneMask(SubReg)
+                                  : MRI->getMaxLaneMaskForVReg(Reg);
+
+      // If this is the first time we see a subregister def, initialize
+      // subranges by creating a copy of the main range.
+      if (!LI.hasSubRanges() && !LI.empty()) {
+        unsigned ClassMask = MRI->getMaxLaneMaskForVReg(Reg);
+        LI.createSubRangeFrom(*Alloc, ClassMask, LI);
+      }
+
+      for (LiveInterval::SubRange &S : LI.subranges()) {
+        // A Mask for subregs common to the existing subrange and current def.
+        unsigned Common = S.LaneMask & Mask;
+        if (Common == 0)
+          continue;
+        // A Mask for subregs covered by the subrange but not the current def.
+        unsigned LRest = S.LaneMask & ~Mask;
+        LiveInterval::SubRange *CommonRange;
+        if (LRest != 0) {
+          // Split current subrange into Common and LRest ranges.
+          S.LaneMask = LRest;
+          CommonRange = LI.createSubRangeFrom(*Alloc, Common, S);
+        } else {
+          assert(Common == S.LaneMask);
+          CommonRange = &S;
+        }
+        if (MO.isDef())
+          createDeadDef(*Indexes, *Alloc, *CommonRange, MO);
+        Mask &= ~Common;
+      }
+      // Create a new SubRange for subregs we did not cover yet.
+      if (Mask != 0) {
+        LiveInterval::SubRange *NewRange = LI.createSubRange(*Alloc, Mask);
+        if (MO.isDef())
+          createDeadDef(*Indexes, *Alloc, *NewRange, MO);
+      }
+    }
+
+    // Create the def in the main liverange. We do not have to do this if
+    // subranges are tracked as we recreate the main range later in this case.
+    if (MO.isDef() && !LI.hasSubRanges())
+      createDeadDef(*Indexes, *Alloc, LI, MO);
+  }
+
+  // We may have created empty live ranges for partially undefined uses, we
+  // can't keep them because we won't find defs in them later.
+  LI.removeEmptySubRanges();
+
+  // Step 2: Extend live segments to all uses, constructing SSA form as
+  // necessary.
+  if (LI.hasSubRanges()) {
+    for (LiveInterval::SubRange &S : LI.subranges()) {
+      resetLiveOutMap();
+      extendToUses(S, Reg, S.LaneMask);
+    }
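+    // Rebuild the main range from the now complete subranges.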
+    LI.clear();
+    LI.constructMainRangeFromSubranges(*Indexes, *Alloc);
+  } else {
+    resetLiveOutMap();
+    extendToUses(LI, Reg, ~0u);
+  }
+}
+
+
 void LiveRangeCalc::createDeadDefs(LiveRange &LR, unsigned Reg) {
   assert(MRI && Indexes && "call reset() first");
 
   // Visit all def operands. If the same instruction has multiple defs of Reg,
   // LR.createDeadDef() will deduplicate.
-  for (MachineOperand &MO : MRI->def_operands(Reg)) {
-    const MachineInstr *MI = MO.getParent();
-    // Find the corresponding slot index.
-    SlotIndex Idx;
-    if (MI->isPHI())
-      // PHI defs begin at the basic block start index.
-      Idx = Indexes->getMBBStartIdx(MI->getParent());
-    else
-      // Instructions are either normal 'r', or early clobber 'e'.
-      Idx = Indexes->getInstructionIndex(MI)
-        .getRegSlot(MO.isEarlyClobber());
-
-    // Create the def in LR. This may find an existing def.
-    LR.createDeadDef(Idx, *Alloc);
-  }
+  for (MachineOperand &MO : MRI->def_operands(Reg))
+    createDeadDef(*Indexes, *Alloc, LR, MO);
 }
 
 
-void LiveRangeCalc::extendToUses(LiveRange &LR, unsigned Reg) {
-  assert(MRI && Indexes && "call reset() first");
-
+void LiveRangeCalc::extendToUses(LiveRange &LR, unsigned Reg, unsigned Mask) {
   // Visit all operands that read Reg. This may include partial defs.
+  const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo();
   for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
     // Clear all kill flags. They will be reinserted after register allocation
     // by LiveIntervalAnalysis::addKillFlags().
     if (MO.isUse())
       MO.setIsKill(false);
+    else {
+      // We only care about uses, but on the main range (mask ~0u) this
+      // includes the "virtual" reads happening for subregister defs.
+      if (Mask != ~0u)
+        continue;
+    }
+
     if (!MO.readsReg())
       continue;
-    // MI is reading Reg. We may have visited MI before if it happens to be
-    // reading Reg multiple times. That is OK, extend() is idempotent.
+    unsigned SubReg = MO.getSubReg();
+    if (SubReg != 0) {
+      unsigned SubRegMask = TRI.getSubRegIndexLaneMask(SubReg);
+      // Ignore uses not covering the current subrange.
+      if ((SubRegMask & Mask) == 0)
+        continue;
+    }
+
+    // Determine the actual place of the use.
     const MachineInstr *MI = MO.getParent();
     unsigned OpNo = (&MO - &MI->getOperand(0));
-
-    // Find the SlotIndex being read.
-    SlotIndex Idx;
+    SlotIndex UseIdx;
     if (MI->isPHI()) {
       assert(!MO.isDef() && "Cannot handle PHI def of partial register.");
-      // PHI operands are paired: (Reg, PredMBB).
-      // Extend the live range to be live-out from PredMBB.
-      Idx = Indexes->getMBBEndIdx(MI->getOperand(OpNo+1).getMBB());
+      // The actual place where a phi operand is used is the end of the pred
+      // MBB. PHI operands are paired: (Reg, PredMBB).
+      UseIdx = Indexes->getMBBEndIdx(MI->getOperand(OpNo+1).getMBB());
     } else {
-      // This is a normal instruction.
-      Idx = Indexes->getInstructionIndex(MI).getRegSlot();
       // Check for early-clobber redefs.
+      bool isEarlyClobber = false;
       unsigned DefIdx;
-      if (MO.isDef()) {
-        if (MO.isEarlyClobber())
-          Idx = Idx.getRegSlot(true);
-      } else if (MI->isRegTiedToDefOperand(OpNo, &DefIdx)) {
+      if (MO.isDef())
+        isEarlyClobber = MO.isEarlyClobber();
+      else if (MI->isRegTiedToDefOperand(OpNo, &DefIdx)) {
         // FIXME: This would be a lot easier if tied early-clobber uses also
         // had an early-clobber flag.
-        if (MI->getOperand(DefIdx).isEarlyClobber())
-          Idx = Idx.getRegSlot(true);
+        isEarlyClobber = MI->getOperand(DefIdx).isEarlyClobber();
       }
+      UseIdx = Indexes->getInstructionIndex(MI).getRegSlot(isEarlyClobber);
     }
-    extend(LR, Idx, Reg);
+
+    // MI is reading Reg. We may have visited MI before if it happens to be
+    // reading Reg multiple times. That is OK, extend() is idempotent.
+    extend(LR, UseIdx, Reg);
   }
 }
 
 
-// Transfer information from the LiveIn vector to the live ranges.
-void LiveRangeCalc::updateLiveIns() {
+void LiveRangeCalc::updateFromLiveIns() {
   LiveRangeUpdater Updater;
-  for (SmallVectorImpl<LiveInBlock>::iterator I = LiveIn.begin(),
-         E = LiveIn.end(); I != E; ++I) {
-    if (!I->DomNode)
+  for (const LiveInBlock &I : LiveIn) {
+    if (!I.DomNode)
       continue;
-    MachineBasicBlock *MBB = I->DomNode->getBlock();
-    assert(I->Value && "No live-in value found");
+    MachineBasicBlock *MBB = I.DomNode->getBlock();
+    assert(I.Value && "No live-in value found");
     SlotIndex Start, End;
     std::tie(Start, End) = Indexes->getMBBRange(MBB);
 
-    if (I->Kill.isValid())
+    if (I.Kill.isValid())
       // Value is killed inside this block.
-      End = I->Kill;
+      End = I.Kill;
     else {
       // The value is live-through, update LiveOut as well.
       // Defer the Domtree lookup until it is needed.
       assert(Seen.test(MBB->getNumber()));
-      LiveOut[MBB] = LiveOutPair(I->Value, (MachineDomTreeNode *)nullptr);
+      Map[MBB] = LiveOutPair(I.Value, nullptr);
     }
-    Updater.setDest(&I->LR);
-    Updater.add(Start, End, I->Value);
+    Updater.setDest(&I.LR);
+    Updater.add(Start, End, I.Value);
   }
   LiveIn.clear();
 }
 
 
-void LiveRangeCalc::extend(LiveRange &LR, SlotIndex Kill, unsigned PhysReg) {
-  assert(Kill.isValid() && "Invalid SlotIndex");
+void LiveRangeCalc::extend(LiveRange &LR, SlotIndex Use, unsigned PhysReg) {
+  assert(Use.isValid() && "Invalid SlotIndex");
   assert(Indexes && "Missing SlotIndexes");
   assert(DomTree && "Missing dominator tree");
 
-  MachineBasicBlock *KillMBB = Indexes->getMBBFromIndex(Kill.getPrevSlot());
-  assert(KillMBB && "No MBB at Kill");
+  MachineBasicBlock *UseMBB = Indexes->getMBBFromIndex(Use.getPrevSlot());
+  assert(UseMBB && "No MBB at Use");
 
   // Is there a def in the same MBB we can extend?
-  if (LR.extendInBlock(Indexes->getMBBStartIdx(KillMBB), Kill))
+  if (LR.extendInBlock(Indexes->getMBBStartIdx(UseMBB), Use))
     return;
 
-  // Find the single reaching def, or determine if Kill is jointly dominated by
+  // Find the single reaching def, or determine if Use is jointly dominated by
   // multiple values, and we may need to create even more phi-defs to preserve
   // VNInfo SSA form.  Perform a search for all predecessor blocks where we
   // know the dominating VNInfo.
-  if (findReachingDefs(LR, *KillMBB, Kill, PhysReg))
+  if (findReachingDefs(LR, *UseMBB, Use, PhysReg))
     return;
 
   // When there were multiple different values, we may need new PHIs.
@@ -162,16 +250,16 @@
   assert(Indexes && "Missing SlotIndexes");
   assert(DomTree && "Missing dominator tree");
   updateSSA();
-  updateLiveIns();
+  updateFromLiveIns();
 }
 
 
-bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &KillMBB,
-                                     SlotIndex Kill, unsigned PhysReg) {
-  unsigned KillMBBNum = KillMBB.getNumber();
+bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB,
+                                     SlotIndex Use, unsigned PhysReg) {
+  unsigned UseMBBNum = UseMBB.getNumber();
 
   // Block numbers where LR should be live-in.
-  SmallVector<unsigned, 16> WorkList(1, KillMBBNum);
+  SmallVector<unsigned, 16> WorkList(1, UseMBBNum);
 
   // Remember if we have seen more than one value.
   bool UniqueVNI = true;
@@ -202,7 +290,7 @@
 
        // Is this a known live-out block?
        if (Seen.test(Pred->getNumber())) {
-         if (VNInfo *VNI = LiveOut[Pred].first) {
+         if (VNInfo *VNI = Map[Pred].first) {
            if (TheVNI && TheVNI != VNI)
              UniqueVNI = false;
            TheVNI = VNI;
@@ -225,11 +313,11 @@
        }
 
        // No, we need a live-in value for Pred as well
-       if (Pred != &KillMBB)
+       if (Pred != &UseMBB)
           WorkList.push_back(Pred->getNumber());
        else
-          // Loopback to KillMBB, so value is really live through.
-         Kill = SlotIndex();
+          // Loopback to UseMBB, so value is really live through.
+         Use = SlotIndex();
     }
   }
 
@@ -247,12 +335,11 @@
          E = WorkList.end(); I != E; ++I) {
        SlotIndex Start, End;
        std::tie(Start, End) = Indexes->getMBBRange(*I);
-       // Trim the live range in KillMBB.
-       if (*I == KillMBBNum && Kill.isValid())
-         End = Kill;
+       // Trim the live range in UseMBB.
+       if (*I == UseMBBNum && Use.isValid())
+         End = Use;
        else
-         LiveOut[MF->getBlockNumbered(*I)] =
-           LiveOutPair(TheVNI, nullptr);
+         Map[MF->getBlockNumbered(*I)] = LiveOutPair(TheVNI, nullptr);
        Updater.add(Start, End, TheVNI);
     }
     return true;
@@ -265,8 +352,8 @@
        I = WorkList.begin(), E = WorkList.end(); I != E; ++I) {
     MachineBasicBlock *MBB = MF->getBlockNumbered(*I);
     addLiveInBlock(LR, DomTree->getNode(MBB));
-    if (MBB == &KillMBB)
-      LiveIn.back().Kill = Kill;
+    if (MBB == &UseMBB)
+      LiveIn.back().Kill = Use;
   }
 
   return false;
@@ -285,9 +372,8 @@
     Changes = 0;
     // Propagate live-out values down the dominator tree, inserting phi-defs
     // when necessary.
-    for (SmallVectorImpl<LiveInBlock>::iterator I = LiveIn.begin(),
-           E = LiveIn.end(); I != E; ++I) {
-      MachineDomTreeNode *Node = I->DomNode;
+    for (LiveInBlock &I : LiveIn) {
+      MachineDomTreeNode *Node = I.DomNode;
       // Skip block if the live-in value has already been determined.
       if (!Node)
         continue;
@@ -303,16 +389,16 @@
       // immediate dominator. Check if any of them have live-out values that are
       // properly dominated by IDom. If so, we need a phi-def here.
       if (!needPHI) {
-        IDomValue = LiveOut[IDom->getBlock()];
+        IDomValue = Map[IDom->getBlock()];
 
         // Cache the DomTree node that defined the value.
         if (IDomValue.first && !IDomValue.second)
-          LiveOut[IDom->getBlock()].second = IDomValue.second =
+          Map[IDom->getBlock()].second = IDomValue.second =
             DomTree->getNode(Indexes->getMBBFromIndex(IDomValue.first->def));
 
         for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
                PE = MBB->pred_end(); PI != PE; ++PI) {
-          LiveOutPair &Value = LiveOut[*PI];
+          LiveOutPair &Value = Map[*PI];
           if (!Value.first || Value.first == IDomValue.first)
             continue;
 
@@ -334,7 +420,7 @@
       // The value may be live-through even if Kill is set, as can happen when
       // we are called from extendRange. In that case LiveOutSeen is true, and
       // LiveOut indicates a foreign or missing value.
-      LiveOutPair &LOP = LiveOut[MBB];
+      LiveOutPair &LOP = Map[MBB];
 
       // Create a phi-def if required.
       if (needPHI) {
@@ -342,25 +428,25 @@
         assert(Alloc && "Need VNInfo allocator to create PHI-defs");
         SlotIndex Start, End;
         std::tie(Start, End) = Indexes->getMBBRange(MBB);
-        LiveRange &LR = I->LR;
+        LiveRange &LR = I.LR;
         VNInfo *VNI = LR.getNextValue(Start, *Alloc);
-        I->Value = VNI;
+        I.Value = VNI;
         // This block is done, we know the final value.
-        I->DomNode = nullptr;
+        I.DomNode = nullptr;
 
-        // Add liveness since updateLiveIns now skips this node.
-        if (I->Kill.isValid())
-          LR.addSegment(LiveInterval::Segment(Start, I->Kill, VNI));
+        // Add liveness since updateFromLiveIns now skips this node.
+        if (I.Kill.isValid())
+          LR.addSegment(LiveInterval::Segment(Start, I.Kill, VNI));
         else {
           LR.addSegment(LiveInterval::Segment(Start, End, VNI));
           LOP = LiveOutPair(VNI, Node);
         }
       } else if (IDomValue.first) {
         // No phi-def here. Remember incoming value.
-        I->Value = IDomValue.first;
+        I.Value = IDomValue.first;
 
         // If the IDomValue is killed in the block, don't propagate through.
-        if (I->Kill.isValid())
+        if (I.Kill.isValid())
           continue;
 
         // Propagate IDomValue if it isn't killed:
diff --git a/lib/CodeGen/LiveRangeCalc.h b/lib/CodeGen/LiveRangeCalc.h
index 345d6c4..90bf971 100644
--- a/lib/CodeGen/LiveRangeCalc.h
+++ b/lib/CodeGen/LiveRangeCalc.h
@@ -40,12 +40,6 @@
   MachineDominatorTree *DomTree;
   VNInfo::Allocator *Alloc;
 
-  /// Seen - Bit vector of active entries in LiveOut, also used as a visited
-  /// set by findReachingDefs.  One entry per basic block, indexed by block
-  /// number.  This is kept as a separate bit vector because it can be cleared
-  /// quickly when switching live ranges.
-  BitVector Seen;
-
   /// LiveOutPair - A value and the block that defined it.  The domtree node is
   /// redundant, it can be computed as: MDT[Indexes.getMBBFromIndex(VNI->def)].
   typedef std::pair<VNInfo*, MachineDomTreeNode*> LiveOutPair;
@@ -53,8 +47,14 @@
   /// LiveOutMap - Map basic blocks to the value leaving the block.
   typedef IndexedMap<LiveOutPair, MBB2NumberFunctor> LiveOutMap;
 
-  /// LiveOut - Map each basic block where a live range is live out to the
-  /// live-out value and its defining block.
+  /// Bit vector of active entries in LiveOut, also used as a visited set by
+  /// findReachingDefs.  One entry per basic block, indexed by block number.
+  /// This is kept as a separate bit vector because it can be cleared quickly
+  /// when switching live ranges.
+  BitVector Seen;
+
+  /// Map each basic block where a live range is live out to the live-out value
+  /// and its defining block.
   ///
   /// For every basic block, MBB, one of these conditions shall be true:
   ///
@@ -70,7 +70,7 @@
   ///
   /// The map can be shared by multiple live ranges as long as no two are
   /// live-out of the same block.
-  LiveOutMap LiveOut;
+  LiveOutMap Map;
 
   /// LiveInBlock - Information about a basic block where a live range is known
   /// to be live-in, but the value has not yet been determined.
@@ -101,17 +101,17 @@
   /// used to add entries directly.
   SmallVector<LiveInBlock, 16> LiveIn;
 
-  /// Assuming that LI is live-in to KillMBB and killed at Kill, find the set
-  /// of defs that can reach it.
+  /// Assuming that @p LR is live-in to @p UseMBB, find the set of defs that can
+  /// reach it.
   ///
-  /// If only one def can reach Kill, all paths from the def to kill are added
-  /// to LI, and the function returns true.
+  /// If only one def can reach @p UseMBB, all paths from the def to @p UseMBB
+  /// are added to @p LR, and the function returns true.
   ///
-  /// If multiple values can reach Kill, the blocks that need LI to be live in
-  /// are added to the LiveIn array, and the function returns false.
+  /// If multiple values can reach @p UseMBB, the blocks that need @p LR to be
+  /// live in are added to the LiveIn array, and the function returns false.
   ///
   /// PhysReg, when set, is used to verify live-in lists on basic blocks.
-  bool findReachingDefs(LiveRange &LR, MachineBasicBlock &KillMBB,
+  bool findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB,
                         SlotIndex Kill, unsigned PhysReg);
 
   /// updateSSA - Compute the values that will be live in to all requested
@@ -121,8 +121,18 @@
   /// blocks.  No values are read from the live ranges.
   void updateSSA();
 
-  /// Add liveness as specified in the LiveIn vector.
-  void updateLiveIns();
+  /// Transfer information from the LiveIn vector to the live ranges and
+  /// update the live-out map.
+  void updateFromLiveIns();
+
+  /// Extend the live range of @p LR to reach all uses of Reg.
+  ///
+  /// All uses must be jointly dominated by existing liveness.  PHI-defs are
+  /// inserted as needed to preserve SSA form.
+  void extendToUses(LiveRange &LR, unsigned Reg, unsigned LaneMask);
+
+  /// Reset Map and Seen fields.
+  void resetLiveOutMap();
 
 public:
   LiveRangeCalc() : MF(nullptr), MRI(nullptr), Indexes(nullptr),
@@ -152,37 +162,33 @@
   // Modify existing live ranges.
   //
 
-  /// extend - Extend the live range of LI to reach Kill.
+  /// Extend the live range of @p LR to reach @p Use.
   ///
-  /// The existing values in LI must be live so they jointly dominate Kill.  If
-  /// Kill is not dominated by a single existing value, PHI-defs are inserted
-  /// as required to preserve SSA form.  If Kill is known to be dominated by a
-  /// single existing value, Alloc may be null.
+  /// The existing values in @p LR must be live so they jointly dominate @p Use.
+  /// If @p Use is not dominated by a single existing value, PHI-defs are
+  /// inserted as required to preserve SSA form.
   ///
   /// PhysReg, when set, is used to verify live-in lists on basic blocks.
-  void extend(LiveRange &LR, SlotIndex Kill, unsigned PhysReg = 0);
+  void extend(LiveRange &LR, SlotIndex Use, unsigned PhysReg = 0);
 
   /// createDeadDefs - Create a dead def in LI for every def operand of Reg.
   /// Each instruction defining Reg gets a new VNInfo with a corresponding
   /// minimal live range.
   void createDeadDefs(LiveRange &LR, unsigned Reg);
 
-  /// createDeadDefs - Create a dead def in LI for every def of LI->reg.
-  void createDeadDefs(LiveInterval &LI) {
-    createDeadDefs(LI, LI.reg);
-  }
-
-  /// extendToUses - Extend the live range of LI to reach all uses of Reg.
+  /// Extend the live range of @p LR to reach all uses of Reg.
   ///
   /// All uses must be jointly dominated by existing liveness.  PHI-defs are
   /// inserted as needed to preserve SSA form.
-  void extendToUses(LiveRange &LR, unsigned Reg);
-
-  /// extendToUses - Extend the live range of LI to reach all uses of LI->reg.
-  void extendToUses(LiveInterval &LI) {
-    extendToUses(LI, LI.reg);
+  void extendToUses(LiveRange &LR, unsigned PhysReg) {
+    extendToUses(LR, PhysReg, ~0u);
   }
 
+  /// Calculates liveness for the register specified in live interval @p LI.
+  /// Creates subregister live ranges as needed if subreg liveness tracking is
+  /// enabled.
+  void calculate(LiveInterval &LI);
+
   //===--------------------------------------------------------------------===//
   // Low-level interface.
   //===--------------------------------------------------------------------===//
@@ -204,7 +210,7 @@
   /// addLiveInBlock().
   void setLiveOutValue(MachineBasicBlock *MBB, VNInfo *VNI) {
     Seen.set(MBB->getNumber());
-    LiveOut[MBB] = LiveOutPair(VNI, nullptr);
+    Map[MBB] = LiveOutPair(VNI, nullptr);
   }
 
   /// addLiveInBlock - Add a block with an unknown live-in value.  This
diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp
index a0fb712..0edc897 100644
--- a/lib/CodeGen/LiveRangeEdit.cpp
+++ b/lib/CodeGen/LiveRangeEdit.cpp
@@ -60,9 +60,7 @@
 }
 
 void LiveRangeEdit::scanRemattable(AliasAnalysis *aa) {
-  for (LiveInterval::vni_iterator I = getParent().vni_begin(),
-       E = getParent().vni_end(); I != E; ++I) {
-    VNInfo *VNI = *I;
+  for (VNInfo *VNI : getParent().valnos) {
     if (VNI->isUnused())
       continue;
     MachineInstr *DefMI = LIS.getInstructionFromIndex(VNI->def);
@@ -258,15 +256,8 @@
       // Check if MI reads any unreserved physregs.
       if (Reg && MOI->readsReg() && !MRI.isReserved(Reg))
         ReadsPhysRegs = true;
-      else if (MOI->isDef()) {
-        for (MCRegUnitIterator Units(Reg, MRI.getTargetRegisterInfo());
-             Units.isValid(); ++Units) {
-          if (LiveRange *LR = LIS.getCachedRegUnit(*Units)) {
-            if (VNInfo *VNI = LR->getVNInfoAt(Idx))
-              LR->removeValNo(VNI);
-          }
-        }
-      }
+      else if (MOI->isDef())
+        LIS.removePhysRegDefAt(Reg, Idx);
       continue;
     }
     LiveInterval &LI = LIS.getInterval(Reg);
@@ -282,13 +273,11 @@
 
     // Remove defined value.
     if (MOI->isDef()) {
-      if (VNInfo *VNI = LI.getVNInfoAt(Idx)) {
-        if (TheDelegate)
-          TheDelegate->LRE_WillShrinkVirtReg(LI.reg);
-        LI.removeValNo(VNI);
-        if (LI.empty())
-          RegsToErase.push_back(Reg);
-      }
+      if (TheDelegate && LI.getVNInfoAt(Idx) != nullptr)
+        TheDelegate->LRE_WillShrinkVirtReg(LI.reg);
+      LIS.removeVRegDefAt(LI, Idx);
+      if (LI.empty())
+        RegsToErase.push_back(Reg);
     }
   }
 
@@ -410,7 +399,7 @@
   VirtRegAuxInfo VRAI(MF, LIS, Loops, MBFI);
   for (unsigned I = 0, Size = size(); I < Size; ++I) {
     LiveInterval &LI = LIS.getInterval(get(I));
-    if (MRI.recomputeRegClass(LI.reg, MF.getTarget()))
+    if (MRI.recomputeRegClass(LI.reg))
       DEBUG({
         const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
         dbgs() << "Inflated " << PrintReg(LI.reg) << " to "
diff --git a/lib/CodeGen/LiveRegMatrix.cpp b/lib/CodeGen/LiveRegMatrix.cpp
index a8cae08..154ce6f 100644
--- a/lib/CodeGen/LiveRegMatrix.cpp
+++ b/lib/CodeGen/LiveRegMatrix.cpp
@@ -18,6 +18,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 
@@ -71,16 +72,44 @@
   }
 }
 
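+// Visit each regunit of PhysReg together with the live range that should be
+// compared against it: the first subrange of VRegInterval overlapping the
+// regunit's lane mask when subregister liveness is tracked, or the main range
+// otherwise. Returns true as soon as Func returns true.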
+template<typename Callable>
+bool foreachUnit(const TargetRegisterInfo *TRI, LiveInterval &VRegInterval,
+                 unsigned PhysReg, Callable Func) {
+  if (VRegInterval.hasSubRanges()) {
+    for (MCRegUnitMaskIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+      unsigned Unit = (*Units).first;
+      unsigned Mask = (*Units).second;
+      for (LiveInterval::SubRange &S : VRegInterval.subranges()) {
+        if (S.LaneMask & Mask) {
+          if (Func(Unit, S))
+            return true;
+          break;
+        }
+      }
+    }
+  } else {
+    for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+      if (Func(*Units, VRegInterval))
+        return true;
+    }
+  }
+  return false;
+}
+
 void LiveRegMatrix::assign(LiveInterval &VirtReg, unsigned PhysReg) {
   DEBUG(dbgs() << "assigning " << PrintReg(VirtReg.reg, TRI)
                << " to " << PrintReg(PhysReg, TRI) << ':');
   assert(!VRM->hasPhys(VirtReg.reg) && "Duplicate VirtReg assignment");
   VRM->assignVirt2Phys(VirtReg.reg, PhysReg);
   MRI->setPhysRegUsed(PhysReg);
-  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
-    DEBUG(dbgs() << ' ' << PrintRegUnit(*Units, TRI));
-    Matrix[*Units].unify(VirtReg);
-  }
+
+  foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit,
+                                         const LiveRange &Range) {
+    DEBUG(dbgs() << ' ' << PrintRegUnit(Unit, TRI) << ' ' << Range);
+    Matrix[Unit].unify(VirtReg, Range);
+    return false;
+  });
+
   ++NumAssigned;
   DEBUG(dbgs() << '\n');
 }
@@ -90,10 +119,14 @@
   DEBUG(dbgs() << "unassigning " << PrintReg(VirtReg.reg, TRI)
                << " from " << PrintReg(PhysReg, TRI) << ':');
   VRM->clearVirt(VirtReg.reg);
-  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
-    DEBUG(dbgs() << ' ' << PrintRegUnit(*Units, TRI));
-    Matrix[*Units].extract(VirtReg);
-  }
+
+  foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit,
+                                         const LiveRange &Range) {
+    DEBUG(dbgs() << ' ' << PrintRegUnit(Unit, TRI));
+    Matrix[Unit].extract(VirtReg, Range);
+    return false;
+  });
+
   ++NumUnassigned;
   DEBUG(dbgs() << '\n');
 }
@@ -121,12 +154,13 @@
   if (VirtReg.empty())
     return false;
   CoalescerPair CP(VirtReg.reg, PhysReg, *TRI);
-  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
-    const LiveRange &UnitRange = LIS->getRegUnit(*Units);
-    if (VirtReg.overlaps(UnitRange, CP, *LIS->getSlotIndexes()))
-      return true;
-  }
-  return false;
+
+  bool Result = foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit,
+                                                       const LiveRange &Range) {
+    const LiveRange &UnitRange = LIS->getRegUnit(Unit);
+    return Range.overlaps(UnitRange, CP, *LIS->getSlotIndexes());
+  });
+  return Result;
 }
 
 LiveIntervalUnion::Query &LiveRegMatrix::query(LiveInterval &VirtReg,
diff --git a/lib/CodeGen/LocalStackSlotAllocation.cpp b/lib/CodeGen/LocalStackSlotAllocation.cpp
index 5c5712f..e8bf687 100644
--- a/lib/CodeGen/LocalStackSlotAllocation.cpp
+++ b/lib/CodeGen/LocalStackSlotAllocation.cpp
@@ -291,6 +291,7 @@
       // Debug value, stackmap and patchpoint instructions can't be out of
       // range, so they don't need any updates.
       if (MI->isDebugValue() ||
+          MI->getOpcode() == TargetOpcode::STATEPOINT ||
           MI->getOpcode() == TargetOpcode::STACKMAP ||
           MI->getOpcode() == TargetOpcode::PATCHPOINT)
         continue;
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index 3058b1a..3c73905 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -24,7 +24,6 @@
 #include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/DataLayout.h"
-#include "llvm/IR/LeakDetector.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/Support/Debug.h"
@@ -45,7 +44,6 @@
 }
 
 MachineBasicBlock::~MachineBasicBlock() {
-  LeakDetector::removeGarbageObject(this);
 }
 
 /// getSymbol - Return the MCSymbol for this basic block.
@@ -54,9 +52,7 @@
   if (!CachedMCSymbol) {
     const MachineFunction *MF = getParent();
     MCContext &Ctx = MF->getContext();
-    const TargetMachine &TM = MF->getTarget();
-    const char *Prefix =
-        TM.getSubtargetImpl()->getDataLayout()->getPrivateGlobalPrefix();
+    const char *Prefix = Ctx.getAsmInfo()->getPrivateLabelPrefix();
     CachedMCSymbol = Ctx.GetOrCreateSymbol(Twine(Prefix) + "BB" +
                                            Twine(MF->getFunctionNumber()) +
                                            "_" + Twine(getNumber()));
@@ -87,14 +83,11 @@
   for (MachineBasicBlock::instr_iterator
          I = N->instr_begin(), E = N->instr_end(); I != E; ++I)
     I->AddRegOperandsToUseLists(RegInfo);
-
-  LeakDetector::removeGarbageObject(N);
 }
 
 void ilist_traits<MachineBasicBlock>::removeNodeFromList(MachineBasicBlock *N) {
   N->getParent()->removeFromMBBNumbering(N->Number);
   N->Number = -1;
-  LeakDetector::addGarbageObject(N);
 }
 
 
@@ -109,8 +102,6 @@
   // use/def lists.
   MachineFunction *MF = Parent->getParent();
   N->AddRegOperandsToUseLists(MF->getRegInfo());
-
-  LeakDetector::removeGarbageObject(N);
 }
 
 /// removeNodeFromList (MI) - When we remove an instruction from a basic block
@@ -124,8 +115,6 @@
     N->RemoveRegOperandsFromUseLists(MF->getRegInfo());
 
   N->setParent(nullptr);
-
-  LeakDetector::addGarbageObject(N);
 }
 
 /// transferNodesFromList (MI) - When moving a range of instructions from one
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index 08fd200..1b5c1f1 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -347,65 +347,61 @@
   uint32_t WeightScale = 0;
   uint32_t SumWeight = MBPI->getSumForBlock(BB, WeightScale);
   DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n");
-  for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
-                                        SE = BB->succ_end();
-       SI != SE; ++SI) {
-    if (BlockFilter && !BlockFilter->count(*SI))
+  for (MachineBasicBlock *Succ : BB->successors()) {
+    if (BlockFilter && !BlockFilter->count(Succ))
       continue;
-    BlockChain &SuccChain = *BlockToChain[*SI];
+    BlockChain &SuccChain = *BlockToChain[Succ];
     if (&SuccChain == &Chain) {
-      DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> Already merged!\n");
+      DEBUG(dbgs() << "    " << getBlockName(Succ) << " -> Already merged!\n");
       continue;
     }
-    if (*SI != *SuccChain.begin()) {
-      DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> Mid chain!\n");
+    if (Succ != *SuccChain.begin()) {
+      DEBUG(dbgs() << "    " << getBlockName(Succ) << " -> Mid chain!\n");
       continue;
     }
 
-    uint32_t SuccWeight = MBPI->getEdgeWeight(BB, *SI);
+    uint32_t SuccWeight = MBPI->getEdgeWeight(BB, Succ);
     BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight);
 
     // Only consider successors which are either "hot", or wouldn't violate
     // any CFG constraints.
     if (SuccChain.LoopPredecessors != 0) {
       if (SuccProb < HotProb) {
-        DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> " << SuccProb
+        DEBUG(dbgs() << "    " << getBlockName(Succ) << " -> " << SuccProb
                      << " (prob) (CFG conflict)\n");
         continue;
       }
 
-      // Make sure that a hot successor doesn't have a globally more important
-      // predecessor.
-      BlockFrequency CandidateEdgeFreq
-        = MBFI->getBlockFreq(BB) * SuccProb * HotProb.getCompl();
+      // Make sure that a hot successor doesn't have a globally more
+      // important predecessor.
+      BlockFrequency CandidateEdgeFreq =
+          MBFI->getBlockFreq(BB) * SuccProb * HotProb.getCompl();
       bool BadCFGConflict = false;
-      for (MachineBasicBlock::pred_iterator PI = (*SI)->pred_begin(),
-                                            PE = (*SI)->pred_end();
-           PI != PE; ++PI) {
-        if (*PI == *SI || (BlockFilter && !BlockFilter->count(*PI)) ||
-            BlockToChain[*PI] == &Chain)
+      for (MachineBasicBlock *Pred : Succ->predecessors()) {
+        if (Pred == Succ || (BlockFilter && !BlockFilter->count(Pred)) ||
+            BlockToChain[Pred] == &Chain)
           continue;
-        BlockFrequency PredEdgeFreq
-          = MBFI->getBlockFreq(*PI) * MBPI->getEdgeProbability(*PI, *SI);
+        BlockFrequency PredEdgeFreq =
+            MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
         if (PredEdgeFreq >= CandidateEdgeFreq) {
           BadCFGConflict = true;
           break;
         }
       }
       if (BadCFGConflict) {
-        DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> " << SuccProb
+        DEBUG(dbgs() << "    " << getBlockName(Succ) << " -> " << SuccProb
                      << " (prob) (non-cold CFG conflict)\n");
         continue;
       }
     }
 
-    DEBUG(dbgs() << "    " << getBlockName(*SI) << " -> " << SuccProb
+    DEBUG(dbgs() << "    " << getBlockName(Succ) << " -> " << SuccProb
                  << " (prob)"
                  << (SuccChain.LoopPredecessors != 0 ? " (CFG break)" : "")
                  << "\n");
     if (BestSucc && BestWeight >= SuccWeight)
       continue;
-    BestSucc = *SI;
+    BestSucc = Succ;
     BestWeight = SuccWeight;
   }
   return BestSucc;
@@ -1043,12 +1039,8 @@
   // exclusively on the loop info here so that we can align backedges in
   // unnatural CFGs and backedges that were introduced purely because of the
   // loop rotations done during this layout pass.
-  if (F.getFunction()->getAttributes().
-        hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize))
+  if (F.getFunction()->hasFnAttribute(Attribute::OptimizeForSize))
     return;
-  unsigned Align = TLI->getPrefLoopAlignment();
-  if (!Align)
-    return;  // Don't care about loop alignment.
   if (FunctionChain.begin() == FunctionChain.end())
     return;  // Empty chain.
 
@@ -1066,6 +1058,10 @@
     if (!L)
       continue;
 
+    unsigned Align = TLI->getPrefLoopAlignment(L);
+    if (!Align)
+      continue;  // Don't care about loop alignment.
+
     // If the block is cold relative to the function entry don't waste space
     // aligning it.
     BlockFrequency Freq = MBFI->getBlockFreq(*BI);
diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp
index ae26967..21b9c5a 100644
--- a/lib/CodeGen/MachineCSE.cpp
+++ b/lib/CodeGen/MachineCSE.cpp
@@ -451,6 +451,7 @@
 
   SmallVector<std::pair<unsigned, unsigned>, 8> CSEPairs;
   SmallVector<unsigned, 2> ImplicitDefsToUpdate;
+  SmallVector<unsigned, 2> ImplicitDefs;
   for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; ) {
     MachineInstr *MI = &*I;
     ++I;
@@ -542,6 +543,12 @@
       // we should make sure it is not dead at CSMI.
       if (MO.isImplicit() && !MO.isDead() && CSMI->getOperand(i).isDead())
         ImplicitDefsToUpdate.push_back(i);
+
+      // Keep track of implicit defs of CSMI and MI, to clear possibly
+      // made-redundant kill flags.
+      if (MO.isImplicit() && !MO.isDead() && OldReg == NewReg)
+        ImplicitDefs.push_back(OldReg);
+
       if (OldReg == NewReg) {
         --NumDefs;
         continue;
@@ -573,8 +580,15 @@
     // Actually perform the elimination.
     if (DoCSE) {
       for (unsigned i = 0, e = CSEPairs.size(); i != e; ++i) {
-        MRI->replaceRegWith(CSEPairs[i].first, CSEPairs[i].second);
-        MRI->clearKillFlags(CSEPairs[i].second);
+        unsigned OldReg = CSEPairs[i].first;
+        unsigned NewReg = CSEPairs[i].second;
+        // OldReg may have been unused but is used now; clear the Dead flag.
+        MachineInstr *Def = MRI->getUniqueVRegDef(NewReg);
+        assert(Def != nullptr && "CSEd register has no unique definition?");
+        Def->clearRegisterDeads(NewReg);
+        // Replace with NewReg and clear kill flags which may be wrong now.
+        MRI->replaceRegWith(OldReg, NewReg);
+        MRI->clearKillFlags(NewReg);
       }
 
       // Go through implicit defs of CSMI and MI, if a def is not dead at MI,
@@ -582,6 +596,29 @@
       for (unsigned i = 0, e = ImplicitDefsToUpdate.size(); i != e; ++i)
         CSMI->getOperand(ImplicitDefsToUpdate[i]).setIsDead(false);
 
+      // Go through implicit defs of CSMI and MI, and clear the kill flags on
+      // their uses in all the instructions between CSMI and MI.
+      // We might have made some of the kill flags redundant, consider:
+      //   subs  ... %NZCV<imp-def>        <- CSMI
+      //   csinc ... %NZCV<imp-use,kill>   <- this kill flag isn't valid anymore
+      //   subs  ... %NZCV<imp-def>        <- MI, to be eliminated
+      //   csinc ... %NZCV<imp-use,kill>
+      // Since we eliminated MI, and reused a register imp-def'd by CSMI
+      // (here %NZCV), that register, if it was killed before MI, should have
+      // that kill flag removed, because its lifetime was extended.
+      if (CSMI->getParent() == MI->getParent()) {
+        for (MachineBasicBlock::iterator II = CSMI, IE = MI; II != IE; ++II)
+          for (auto ImplicitDef : ImplicitDefs)
+            if (MachineOperand *MO = II->findRegisterUseOperand(
+                    ImplicitDef, /*isKill=*/true, TRI))
+              MO->setIsKill(false);
+      } else {
+        // If the instructions aren't in the same BB, bail out and clear the
+        // kill flag on all uses of the imp-def'd register.
+        for (auto ImplicitDef : ImplicitDefs)
+          MRI->clearKillFlags(ImplicitDef);
+      }
+
       if (CrossMBBPhysDef) {
         // Add physical register defs now coming in from a predecessor to MBB
         // livein list.
@@ -606,6 +643,7 @@
     }
     CSEPairs.clear();
     ImplicitDefsToUpdate.clear();
+    ImplicitDefs.clear();
   }
 
   return Changed;
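
The comment block added above explains why kill flags become stale once CSE extends the live range of an implicitly defined register such as %NZCV. A self-contained sketch of the same idea on simplified types follows; the structs are illustrative stand-ins, not the MachineInstr/MachineOperand API.

#include <cassert>
#include <string>
#include <vector>

struct Use { std::string Reg; bool IsKill; };
struct Inst { std::vector<Use> Uses; };

// Clear kill flags for Reg on every instruction in the half-open range
// [From, To), i.e. from the surviving def (CSMI) up to the eliminated one (MI).
void clearKillsInRange(std::vector<Inst> &Block, size_t From, size_t To,
                       const std::string &Reg) {
  for (size_t I = From; I != To; ++I)
    for (Use &U : Block[I].Uses)
      if (U.Reg == Reg)
        U.IsKill = false;
}

int main() {
  // Index 0: subs  ... %NZCV<imp-def>        <- CSMI
  // Index 1: csinc ... %NZCV<imp-use,kill>   <- kill is stale after CSE
  // Index 2: subs  ... %NZCV<imp-def>        <- MI, eliminated by CSE
  std::vector<Inst> Block(3);
  Block[1].Uses.push_back({"NZCV", true});
  clearKillsInRange(Block, 0, 2, "NZCV");
  assert(!Block[1].Uses[0].IsKill);
  return 0;
}
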
diff --git a/lib/CodeGen/MachineCombiner.cpp b/lib/CodeGen/MachineCombiner.cpp
index 2931258..41045ac 100644
--- a/lib/CodeGen/MachineCombiner.cpp
+++ b/lib/CodeGen/MachineCombiner.cpp
@@ -45,7 +45,7 @@
 
   TargetSchedModel TSchedModel;
 
-  /// OptSize - True if optimizing for code size.
+  /// True if optimizing for code size.
   bool OptSize;
 
 public:
@@ -109,7 +109,7 @@
   return DefInstr;
 }
 
-/// getDepth - Computes depth of instructions in vector \InsInstr.
+/// Computes depth of instructions in vector \InsInstr.
 ///
 /// \param InsInstrs is a vector of machine instructions
 /// \param InstrIdxForVirtReg is a dense map of virtual register to index
@@ -125,7 +125,7 @@
   SmallVector<unsigned, 16> InstrDepth;
   assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n");
 
-  // Foreach instruction in in the new sequence compute the depth based on the
+  // For each instruction in the new sequence compute the depth based on the
   // operands. Use the trace information when possible. For new operands which
   // are tracked in the InstrIdxForVirtReg map depth is looked up in InstrDepth
   for (auto *InstrPtr : InsInstrs) { // for each Use
@@ -169,8 +169,7 @@
   return InstrDepth[NewRootIdx];
 }
 
-/// getLatency - Computes instruction latency as max of latency of defined
-/// operands
+/// Computes instruction latency as max of latency of defined operands.
 ///
 /// \param Root is a machine instruction that could be replaced by NewRoot.
 /// It is used to compute a more accurate latency information for NewRoot in
@@ -211,12 +210,12 @@
   return NewRootLatency;
 }
 
-/// preservesCriticalPathlen - True when the new instruction sequence does not
+/// True when the new instruction sequence does not
 /// lengthen the critical path. The DAGCombine code sequence ends in MI
 /// (Machine Instruction) Root. The new code sequence ends in MI NewRoot. A
 /// necessary condition for the new sequence to replace the old sequence is that
-/// is cannot lengthen the critical path. This is decided by the formula
-/// (NewRootDepth + NewRootLatency) <=  (RootDepth + RootLatency + RootSlack)).
+/// it cannot lengthen the critical path. This is decided by the formula
+/// (NewRootDepth + NewRootLatency) <= (RootDepth + RootLatency + RootSlack)).
 /// The slack is the number of cycles Root can be delayed before the critical
 /// path becomes longer.
 bool MachineCombiner::preservesCriticalPathLen(
@@ -264,8 +263,7 @@
     InstrsSC.push_back(SC);
   }
 }
-/// preservesResourceLen - True when the new instructions do not increase
-/// resource length
+/// True when the new instructions do not increase resource length
 bool MachineCombiner::preservesResourceLen(
     MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
     SmallVectorImpl<MachineInstr *> &InsInstrs,
@@ -300,7 +298,7 @@
 }
 
 /// \returns true when new instruction sequence should be generated
-/// independent if it lenghtens critical path or not
+/// regardless of whether it lengthens the critical path.
 bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
   if (OptSize && (NewSize < OldSize))
     return true;
@@ -309,7 +307,7 @@
   return false;
 }
 
-/// combineInstructions - substitute a slow code sequence with a faster one by
+/// Substitute a slow code sequence with a faster one by
 /// evaluating instruction combining pattern.
 /// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction
 /// combining based on machine trace metrics. Only combine a sequence of
@@ -406,8 +404,7 @@
 }
 
 bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
-  const TargetSubtargetInfo &STI =
-      MF.getTarget().getSubtarget<TargetSubtargetInfo>();
+  const TargetSubtargetInfo &STI = MF.getSubtarget();
   TII = STI.getInstrInfo();
   TRI = STI.getRegisterInfo();
   SchedModel = STI.getSchedModel();
@@ -416,8 +413,7 @@
   Traces = &getAnalysis<MachineTraceMetrics>();
   MinInstr = 0;
 
-  OptSize = MF.getFunction()->getAttributes().hasAttribute(
-      AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+  OptSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
 
   DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
   if (!TII->useMachineCombiner()) {
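
The doc-comment cleanup above restates the acceptance condition used by preservesCriticalPathLen. As a quick worked example of that formula, shown as a standalone sketch rather than the pass itself, with made-up cycle counts rather than measured latencies:

#include <cstdio>

// (NewRootDepth + NewRootLatency) <= (RootDepth + RootLatency + RootSlack)
bool preservesCriticalPath(unsigned NewRootDepth, unsigned NewRootLatency,
                           unsigned RootDepth, unsigned RootLatency,
                           unsigned RootSlack) {
  return NewRootDepth + NewRootLatency <= RootDepth + RootLatency + RootSlack;
}

int main() {
  // E.g. replacing MUL + ADD by MADD: new depth 3, new latency 4 against an
  // old depth 3, latency 3 with one cycle of slack -> 7 <= 7, accepted.
  std::printf("accepted: %d\n", preservesCriticalPath(3, 4, 3, 3, 1));
  // With no slack the same replacement is rejected: 7 <= 6 is false.
  std::printf("accepted: %d\n", preservesCriticalPath(3, 4, 3, 3, 0));
  return 0;
}
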
diff --git a/lib/CodeGen/MachineDominanceFrontier.cpp b/lib/CodeGen/MachineDominanceFrontier.cpp
index 0bee846..acb7c48 100644
--- a/lib/CodeGen/MachineDominanceFrontier.cpp
+++ b/lib/CodeGen/MachineDominanceFrontier.cpp
@@ -8,8 +8,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/MachineDominanceFrontier.h"
-#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/Analysis/DominanceFrontierImpl.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/Passes.h"
 
 
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index 8a2b610..151a260 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -67,17 +67,14 @@
                        STI->getFrameLowering()->isStackRealignable(),
                        !F->hasFnAttribute("no-realign-stack"));
 
-  if (Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                       Attribute::StackAlignment))
-    FrameInfo->ensureMaxAlignment(Fn->getAttributes().
-                                getStackAlignment(AttributeSet::FunctionIndex));
+  if (Fn->hasFnAttribute(Attribute::StackAlignment))
+    FrameInfo->ensureMaxAlignment(Fn->getFnStackAlignment());
 
   ConstantPool = new (Allocator) MachineConstantPool(TM);
   Alignment = STI->getTargetLowering()->getMinFunctionAlignment();
 
   // FIXME: Shouldn't use pref alignment if explicit alignment is set on Fn.
-  if (!Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                        Attribute::OptimizeForSize))
+  if (!Fn->hasFnAttribute(Attribute::OptimizeForSize))
     Alignment = std::max(Alignment,
                          STI->getTargetLowering()->getPrefFunctionAlignment());
 
@@ -462,7 +459,7 @@
 /// normal 'L' label is returned.
 MCSymbol *MachineFunction::getJTISymbol(unsigned JTI, MCContext &Ctx,
                                         bool isLinkerPrivate) const {
-  const DataLayout *DL = getSubtarget().getDataLayout();
+  const DataLayout *DL = getTarget().getDataLayout();
   assert(JumpTableInfo && "No jump tables");
   assert(JTI < JumpTableInfo->getJumpTables().size() && "Invalid JTI!");
 
@@ -477,7 +474,7 @@
 /// getPICBaseSymbol - Return a function-local symbol to represent the PIC
 /// base.
 MCSymbol *MachineFunction::getPICBaseSymbol() const {
-  const DataLayout *DL = getSubtarget().getDataLayout();
+  const DataLayout *DL = getTarget().getDataLayout();
   return Ctx.GetOrCreateSymbol(Twine(DL->getPrivateGlobalPrefix())+
                                Twine(getFunctionNumber())+"$pb");
 }
@@ -587,13 +584,20 @@
   return -++NumFixedObjects;
 }
 
+int MachineFrameInfo::CreateFrameAllocation(uint64_t Size) {
+  // Force the use of a frame pointer. The intention is that this intrinsic be
+  // used in conjunction with unwind mechanisms that leak the frame pointer.
+  setFrameAddressIsTaken(true);
+  Size = RoundUpToAlignment(Size, StackAlignment);
+  return CreateStackObject(Size, StackAlignment, false);
+}
+
 BitVector
 MachineFrameInfo::getPristineRegs(const MachineBasicBlock *MBB) const {
   assert(MBB && "MBB must be valid");
   const MachineFunction *MF = MBB->getParent();
   assert(MF && "MBB must be part of a MachineFunction");
-  const TargetMachine &TM = MF->getTarget();
-  const TargetRegisterInfo *TRI = TM.getSubtargetImpl()->getRegisterInfo();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
   BitVector BV(TRI->getNumRegs());
 
   // Before CSI is calculated, no registers are considered pristine. They can be
@@ -813,7 +817,7 @@
 void MachineConstantPoolValue::anchor() { }
 
 const DataLayout *MachineConstantPool::getDataLayout() const {
-  return TM.getSubtargetImpl()->getDataLayout();
+  return TM.getDataLayout();
 }
 
 Type *MachineConstantPoolEntry::getType() const {
@@ -835,13 +839,13 @@
   switch (getRelocationInfo()) {
   default:
     llvm_unreachable("Unknown section kind");
-  case 2:
+  case Constant::GlobalRelocations:
     Kind = SectionKind::getReadOnlyWithRel();
     break;
-  case 1:
+  case Constant::LocalRelocation:
     Kind = SectionKind::getReadOnlyWithRelLocal();
     break;
-  case 0:
+  case Constant::NoRelocation:
     switch (DL->getTypeAllocSize(getType())) {
     case 4:
       Kind = SectionKind::getMergeableConst4();
@@ -853,7 +857,7 @@
       Kind = SectionKind::getMergeableConst16();
       break;
     default:
-      Kind = SectionKind::getMergeableConst();
+      Kind = SectionKind::getReadOnly();
       break;
     }
   }
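
CreateFrameAllocation above rounds the requested size up to the stack alignment before creating the stack object. A standalone sketch of that rounding, with plain arithmetic standing in for LLVM's RoundUpToAlignment:

#include <cassert>
#include <cstdint>

// Round Size up to the next multiple of Align; Align must be non-zero.
uint64_t roundUpToAlignment(uint64_t Size, uint64_t Align) {
  return (Size + Align - 1) / Align * Align;
}

int main() {
  // With a 16-byte stack alignment a 20-byte request occupies 32 bytes,
  // while an already-aligned 32-byte request is left unchanged.
  assert(roundUpToAlignment(20, 16) == 32);
  assert(roundUpToAlignment(32, 16) == 32);
  return 0;
}
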
diff --git a/lib/CodeGen/MachineFunctionPass.cpp b/lib/CodeGen/MachineFunctionPass.cpp
index 789f204..aaf06a7 100644
--- a/lib/CodeGen/MachineFunctionPass.cpp
+++ b/lib/CodeGen/MachineFunctionPass.cpp
@@ -11,11 +11,18 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/IR/Function.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/CodeGen/MachineFunctionAnalysis.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/DominanceFrontier.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
 using namespace llvm;
 
 Pass *MachineFunctionPass::createPrinterPass(raw_ostream &O,
@@ -43,15 +50,13 @@
   // because CodeGen overloads that to mean preserving the MachineBasicBlock
   // CFG in addition to the LLVM IR CFG.
   AU.addPreserved<AliasAnalysis>();
-  AU.addPreserved("scalar-evolution");
-  AU.addPreserved("iv-users");
-  AU.addPreserved("memdep");
-  AU.addPreserved("live-values");
-  AU.addPreserved("domtree");
-  AU.addPreserved("domfrontier");
-  AU.addPreserved("loops");
-  AU.addPreserved("lda");
-  AU.addPreserved("stack-protector");
+  AU.addPreserved<DominanceFrontier>();
+  AU.addPreserved<DominatorTreeWrapperPass>();
+  AU.addPreserved<IVUsers>();
+  AU.addPreserved<LoopInfoWrapperPass>();
+  AU.addPreserved<MemoryDependenceAnalysis>();
+  AU.addPreserved<ScalarEvolution>();
+  AU.addPreserved<StackProtector>();
 
   FunctionPass::getAnalysisUsage(AU);
 }
diff --git a/lib/CodeGen/MachineFunctionPrinterPass.cpp b/lib/CodeGen/MachineFunctionPrinterPass.cpp
index dee3977..790f5ac 100644
--- a/lib/CodeGen/MachineFunctionPrinterPass.cpp
+++ b/lib/CodeGen/MachineFunctionPrinterPass.cpp
@@ -52,7 +52,7 @@
 }
 
 char &llvm::MachineFunctionPrinterPassID = MachineFunctionPrinterPass::ID;
-INITIALIZE_PASS(MachineFunctionPrinterPass, "print-machineinstrs",
+INITIALIZE_PASS(MachineFunctionPrinterPass, "machineinstr-printer",
                 "Machine Function Printer", false, false)
 
 namespace llvm {
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index 7ad0d94..981e4a3 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -397,7 +397,7 @@
     break;
   case MachineOperand::MO_Metadata:
     OS << '<';
-    getMetadata()->printAsOperand(OS, /*PrintType=*/false);
+    getMetadata()->printAsOperand(OS);
     OS << '>';
     break;
   case MachineOperand::MO_MCSymbol:
@@ -537,7 +537,7 @@
   if (const MDNode *TBAAInfo = MMO.getAAInfo().TBAA) {
     OS << "(tbaa=";
     if (TBAAInfo->getNumOperands() > 0)
-      TBAAInfo->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
+      TBAAInfo->getOperand(0)->printAsOperand(OS);
     else
       OS << "<unknown>";
     OS << ")";
@@ -548,7 +548,7 @@
     OS << "(alias.scope=";
     if (ScopeInfo->getNumOperands() > 0)
       for (unsigned i = 0, ie = ScopeInfo->getNumOperands(); i != ie; ++i) {
-        ScopeInfo->getOperand(i)->printAsOperand(OS, /*PrintType=*/false);
+        ScopeInfo->getOperand(i)->printAsOperand(OS);
         if (i != ie-1)
           OS << ",";
       }
@@ -562,7 +562,7 @@
     OS << "(noalias=";
     if (NoAliasInfo->getNumOperands() > 0)
       for (unsigned i = 0, ie = NoAliasInfo->getNumOperands(); i != ie; ++i) {
-        NoAliasInfo->getOperand(i)->printAsOperand(OS, /*PrintType=*/false);
+        NoAliasInfo->getOperand(i)->printAsOperand(OS);
         if (i != ie-1)
           OS << ",";
       }
@@ -595,10 +595,12 @@
 /// implicit operands. It reserves space for the number of operands specified by
 /// the MCInstrDesc.
 MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &tid,
-                           const DebugLoc dl, bool NoImp)
-  : MCID(&tid), Parent(nullptr), Operands(nullptr), NumOperands(0),
-    Flags(0), AsmPrinterFlags(0),
-    NumMemRefs(0), MemRefs(nullptr), debugLoc(dl) {
+                           DebugLoc dl, bool NoImp)
+    : MCID(&tid), Parent(nullptr), Operands(nullptr), NumOperands(0), Flags(0),
+      AsmPrinterFlags(0), NumMemRefs(0), MemRefs(nullptr),
+      debugLoc(std::move(dl)) {
+  assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor");
+
   // Reserve space for the expected number of operands.
   if (unsigned NumOps = MCID->getNumOperands() +
     MCID->getNumImplicitDefs() + MCID->getNumImplicitUses()) {
@@ -617,12 +619,14 @@
     Flags(0), AsmPrinterFlags(0),
     NumMemRefs(MI.NumMemRefs), MemRefs(MI.MemRefs),
     debugLoc(MI.getDebugLoc()) {
+  assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor");
+
   CapOperands = OperandCapacity::get(MI.getNumOperands());
   Operands = MF.allocateOperandArray(CapOperands);
 
   // Copy operands.
-  for (unsigned i = 0; i != MI.getNumOperands(); ++i)
-    addOperand(MF, MI.getOperand(i));
+  for (const MachineOperand &MO : MI.operands())
+    addOperand(MF, MO);
 
   // Copy all the sensible flags.
   setFlags(MI.Flags);
@@ -641,18 +645,18 @@
 /// this instruction from their respective use lists.  This requires that the
 /// operands already be on their use lists.
 void MachineInstr::RemoveRegOperandsFromUseLists(MachineRegisterInfo &MRI) {
-  for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
-    if (Operands[i].isReg())
-      MRI.removeRegOperandFromUseList(&Operands[i]);
+  for (MachineOperand &MO : operands())
+    if (MO.isReg())
+      MRI.removeRegOperandFromUseList(&MO);
 }
 
 /// AddRegOperandsToUseLists - Add all of the register operands in
 /// this instruction from their respective use lists.  This requires that the
 /// operands not be on their use lists yet.
 void MachineInstr::AddRegOperandsToUseLists(MachineRegisterInfo &MRI) {
-  for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
-    if (Operands[i].isReg())
-      MRI.addRegOperandToUseList(&Operands[i]);
+  for (MachineOperand &MO : operands())
+    if (MO.isReg())
+      MRI.addRegOperandToUseList(&MO);
 }
 
 void MachineInstr::addOperand(const MachineOperand &Op) {
@@ -670,14 +674,8 @@
   if (MRI)
     return MRI->moveOperands(Dst, Src, NumOps);
 
-  // Here it would be convenient to call memmove, so that isn't allowed because
-  // MachineOperand has a constructor and so isn't a POD type.
-  if (Dst < Src)
-    for (unsigned i = 0; i != NumOps; ++i)
-      new (Dst + i) MachineOperand(Src[i]);
-  else
-    for (unsigned i = NumOps; i ; --i)
-      new (Dst + i - 1) MachineOperand(Src[i - 1]);
+  // MachineOperand is a trivially copyable type so we can just use memmove.
+  std::memmove(Dst, Src, NumOps * sizeof(MachineOperand));
 }
 
 /// addOperand - Add the specified operand to the instruction.  If it is an
@@ -922,8 +920,7 @@
   MachineInstr *MI = (MachineInstr *)this;
   MachineRegisterInfo &MRI = MF->getRegInfo();
 
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
+  for (const MachineOperand &MO : MI->operands()) {
     if (!MO.isReg() || !MO.isDef())
       continue;
     unsigned Reg = MO.getReg();
@@ -1326,8 +1323,7 @@
 /// clearKillInfo - Clears kill flags on all operands.
 ///
 void MachineInstr::clearKillInfo() {
-  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
-    MachineOperand &MO = getOperand(i);
+  for (MachineOperand &MO : operands()) {
     if (MO.isReg() && MO.isUse())
       MO.setIsKill(false);
   }
@@ -1340,15 +1336,13 @@
   if (TargetRegisterInfo::isPhysicalRegister(ToReg)) {
     if (SubIdx)
       ToReg = RegInfo.getSubReg(ToReg, SubIdx);
-    for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
-      MachineOperand &MO = getOperand(i);
+    for (MachineOperand &MO : operands()) {
       if (!MO.isReg() || MO.getReg() != FromReg)
         continue;
       MO.substPhysReg(ToReg, RegInfo);
     }
   } else {
-    for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
-      MachineOperand &MO = getOperand(i);
+    for (MachineOperand &MO : operands()) {
       if (!MO.isReg() || MO.getReg() != FromReg)
         continue;
       MO.substVirtReg(ToReg, SubIdx, RegInfo);
@@ -1491,8 +1485,7 @@
 /// allDefsAreDead - Return true if all the defs of this instruction are dead.
 ///
 bool MachineInstr::allDefsAreDead() const {
-  for (unsigned i = 0, e = getNumOperands(); i < e; ++i) {
-    const MachineOperand &MO = getOperand(i);
+  for (const MachineOperand &MO : operands()) {
     if (!MO.isReg() || MO.isUse())
       continue;
     if (!MO.isDead())
@@ -1823,8 +1816,7 @@
                                       const TargetRegisterInfo *RegInfo) {
   if (!TargetRegisterInfo::isPhysicalRegister(Reg))
     RegInfo = nullptr;
-  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
-    MachineOperand &MO = getOperand(i);
+  for (MachineOperand &MO : operands()) {
     if (!MO.isReg() || !MO.isUse() || !MO.isKill())
       continue;
     unsigned OpReg = MO.getReg();
@@ -1885,6 +1877,22 @@
   return true;
 }
 
+void MachineInstr::clearRegisterDeads(unsigned Reg) {
+  for (MachineOperand &MO : operands()) {
+    if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg)
+      continue;
+    MO.setIsDead(false);
+  }
+}
+
+void MachineInstr::addRegisterDefReadUndef(unsigned Reg) {
+  for (MachineOperand &MO : operands()) {
+    if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg || MO.getSubReg() == 0)
+      continue;
+    MO.setIsUndef();
+  }
+}
+
 void MachineInstr::addRegisterDefined(unsigned Reg,
                                       const TargetRegisterInfo *RegInfo) {
   if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
@@ -1892,8 +1900,7 @@
     if (MO)
       return;
   } else {
-    for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
-      const MachineOperand &MO = getOperand(i);
+    for (const MachineOperand &MO : operands()) {
       if (MO.isReg() && MO.getReg() == Reg && MO.isDef() &&
           MO.getSubReg() == 0)
         return;
@@ -1907,8 +1914,7 @@
 void MachineInstr::setPhysRegsDeadExcept(ArrayRef<unsigned> UsedRegs,
                                          const TargetRegisterInfo &TRI) {
   bool HasRegMask = false;
-  for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
-    MachineOperand &MO = getOperand(i);
+  for (MachineOperand &MO : operands()) {
     if (MO.isRegMask()) {
       HasRegMask = true;
       continue;
@@ -1916,15 +1922,10 @@
     if (!MO.isReg() || !MO.isDef()) continue;
     unsigned Reg = MO.getReg();
     if (!TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
-    bool Dead = true;
-    for (ArrayRef<unsigned>::iterator I = UsedRegs.begin(), E = UsedRegs.end();
-         I != E; ++I)
-      if (TRI.regsOverlap(*I, Reg)) {
-        Dead = false;
-        break;
-      }
     // If there are no uses, including partial uses, the def is dead.
-    if (Dead) MO.setIsDead();
+    if (std::none_of(UsedRegs.begin(), UsedRegs.end(),
+                     [&](unsigned Use) { return TRI.regsOverlap(Use, Reg); }))
+      MO.setIsDead();
   }
 
   // This is a call with a register mask operand.
@@ -1941,8 +1942,7 @@
   SmallVector<size_t, 8> HashComponents;
   HashComponents.reserve(MI->getNumOperands() + 1);
   HashComponents.push_back(MI->getOpcode());
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
+  for (const MachineOperand &MO : MI->operands()) {
     if (MO.isReg() && MO.isDef() &&
         TargetRegisterInfo::isVirtualRegister(MO.getReg()))
       continue;  // Skip virtual register defs.
@@ -1960,7 +1960,8 @@
     if (getOperand(i-1).isMetadata() &&
         (LocMD = getOperand(i-1).getMetadata()) &&
         LocMD->getNumOperands() != 0) {
-      if (const ConstantInt *CI = dyn_cast<ConstantInt>(LocMD->getOperand(0))) {
+      if (const ConstantInt *CI =
+              mdconst::dyn_extract<ConstantInt>(LocMD->getOperand(0))) {
         LocCookie = CI->getZExtValue();
         break;
       }
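
moveOperands above replaces the element-by-element placement-new copy with a single memmove, which is only valid because MachineOperand is trivially copyable. A standalone sketch of that pattern, guarded by a static_assert and using a made-up operand type rather than MachineOperand:

#include <cassert>
#include <cstring>
#include <type_traits>

struct Operand { int Kind; unsigned Reg; }; // illustrative stand-in

// Move NumOps operands, handling overlapping source/destination ranges.
void moveOperands(Operand *Dst, Operand *Src, unsigned NumOps) {
  static_assert(std::is_trivially_copyable<Operand>::value,
                "memmove is only correct for trivially copyable types");
  std::memmove(Dst, Src, NumOps * sizeof(Operand));
}

int main() {
  Operand Ops[4] = {{0, 1}, {0, 2}, {0, 3}, {0, 4}};
  moveOperands(Ops, Ops + 1, 3); // shift the last three operands left by one
  assert(Ops[0].Reg == 2 && Ops[2].Reg == 4);
  return 0;
}
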
diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp
index 2ab0467..64d0932 100644
--- a/lib/CodeGen/MachineLICM.cpp
+++ b/lib/CodeGen/MachineLICM.cpp
@@ -49,6 +49,11 @@
                  cl::desc("MachineLICM should avoid speculation"),
                  cl::init(true), cl::Hidden);
 
+static cl::opt<bool>
+HoistCheapInsts("hoist-cheap-insts",
+                cl::desc("MachineLICM should hoist even cheap instructions"),
+                cl::init(false), cl::Hidden);
+
 STATISTIC(NumHoisted,
           "Number of machine instructions hoisted out of loops");
 STATISTIC(NumLowRP,
@@ -688,6 +693,10 @@
 /// one pass without iteration.
 ///
 void MachineLICM::HoistOutOfLoop(MachineDomTreeNode *HeaderN) {
+  MachineBasicBlock *Preheader = getCurPreheader();
+  if (!Preheader)
+    return;
+
   SmallVector<MachineDomTreeNode*, 32> Scopes;
   SmallVector<MachineDomTreeNode*, 8> WorkList;
   DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> ParentMap;
@@ -695,7 +704,7 @@
 
   // Perform a DFS walk to determine the order of visit.
   WorkList.push_back(HeaderN);
-  do {
+  while (!WorkList.empty()) {
     MachineDomTreeNode *Node = WorkList.pop_back_val();
     assert(Node && "Null dominator tree node?");
     MachineBasicBlock *BB = Node->getBlock();
@@ -729,28 +738,21 @@
       ParentMap[Child] = Node;
       WorkList.push_back(Child);
     }
-  } while (!WorkList.empty());
-
-  if (Scopes.size() != 0) {
-    MachineBasicBlock *Preheader = getCurPreheader();
-    if (!Preheader)
-      return;
-
-    // Compute registers which are livein into the loop headers.
-    RegSeen.clear();
-    BackTrace.clear();
-    InitRegPressure(Preheader);
   }
 
+  if (Scopes.size() == 0)
+    return;
+
+  // Compute registers which are livein into the loop headers.
+  RegSeen.clear();
+  BackTrace.clear();
+  InitRegPressure(Preheader);
+
   // Now perform LICM.
   for (unsigned i = 0, e = Scopes.size(); i != e; ++i) {
     MachineDomTreeNode *Node = Scopes[i];
     MachineBasicBlock *MBB = Node->getBlock();
 
-    MachineBasicBlock *Preheader = getCurPreheader();
-    if (!Preheader)
-      continue;
-
     EnterScope(MBB);
 
     // Process the block
@@ -1075,7 +1077,7 @@
 
     // Don't hoist cheap instructions if they would increase register pressure,
     // even if we're under the limit.
-    if (CheapInstr)
+    if (CheapInstr && !HoistCheapInsts)
       return true;
 
     for (unsigned i = BackTrace.size(); i != 0; --i) {
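
HoistOutOfLoop above now bails out early when there is no preheader and drives its dominator-tree walk with an explicit worklist loop. A minimal sketch of that worklist-DFS shape on a toy tree, not the pass itself; the node numbering and children are invented.

#include <cstdio>
#include <vector>

// Visit Root and everything it dominates using an explicit stack instead of
// recursion; the real pass enters/exits a scope around each visited block.
void dfsWorklist(const std::vector<std::vector<int>> &Children, int Root) {
  std::vector<int> WorkList{Root};
  while (!WorkList.empty()) {
    int Node = WorkList.back();
    WorkList.pop_back();
    std::printf("visit %d\n", Node);
    for (int Child : Children[Node])
      WorkList.push_back(Child);
  }
}

int main() {
  // 0 immediately dominates 1 and 2; 1 immediately dominates 3.
  std::vector<std::vector<int>> Children = {{1, 2}, {3}, {}, {}};
  dfsWorklist(Children, 0);
  return 0;
}
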
diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp
index eb3c0bf..fca7df0 100644
--- a/lib/CodeGen/MachineModuleInfo.cpp
+++ b/lib/CodeGen/MachineModuleInfo.cpp
@@ -9,6 +9,7 @@
 
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/ADT/PointerUnion.h"
+#include "llvm/Analysis/LibCallSemantics.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -273,9 +274,10 @@
   CurCallSite = 0;
   CallsEHReturn = 0;
   CallsUnwindInit = 0;
-  DbgInfoAvailable = UsesVAFloatArgument = false; 
+  DbgInfoAvailable = UsesVAFloatArgument = UsesMorestackAddr = false;
   // Always emit some info, by default "no personality" info.
   Personalities.push_back(nullptr);
+  PersonalityTypeCache = EHPersonality::Unknown;
   AddrLabelSymbols = nullptr;
   TheModule = nullptr;
 
@@ -452,6 +454,14 @@
   LP.TypeIds.push_back(0);
 }
 
+MCSymbol *
+MachineModuleInfo::addClauseForLandingPad(MachineBasicBlock *LandingPad) {
+  MCSymbol *ClauseLabel = Context.CreateTempSymbol();
+  LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
+  LP.ClauseLabels.push_back(ClauseLabel);
+  return ClauseLabel;
+}
+
 /// TidyLandingPads - Remap landing pad labels and remove any deleted landing
 /// pads.
 void MachineModuleInfo::TidyLandingPads(DenseMap<MCSymbol*, uintptr_t> *LPMap) {
@@ -546,11 +556,17 @@
 
 /// getPersonality - Return the personality function for the current function.
 const Function *MachineModuleInfo::getPersonality() const {
-  // FIXME: Until PR1414 will be fixed, we're using 1 personality function per
-  // function
-  return !LandingPads.empty() ? LandingPads[0].Personality : nullptr;
+  for (const LandingPadInfo &LPI : LandingPads)
+    if (LPI.Personality)
+      return LPI.Personality;
+  return nullptr;
 }
 
+EHPersonality MachineModuleInfo::getPersonalityType() {
+  if (PersonalityTypeCache == EHPersonality::Unknown)
+    PersonalityTypeCache = classifyEHPersonality(getPersonality());
+  return PersonalityTypeCache;
+}
 /// getPersonalityIndex - Return unique index for current personality
 /// function. NULL/first personality function should always get zero index.
 unsigned MachineModuleInfo::getPersonalityIndex() const {
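
getPersonalityType above classifies the personality function once and caches the result in PersonalityTypeCache. A minimal sketch of that lazy-classification pattern with an invented enum and classifier; this is not the LibCallSemantics API.

#include <cstdio>
#include <string>

enum class Personality { Unknown, CXX, SEH, Other }; // illustrative values

// Hypothetical stand-in for classifyEHPersonality().
Personality classify(const std::string &Name) {
  if (Name == "__gxx_personality_v0") return Personality::CXX;
  if (Name == "__C_specific_handler") return Personality::SEH;
  return Personality::Other;
}

struct ModuleInfo {
  std::string PersonalityName;
  Personality Cache = Personality::Unknown;

  Personality getPersonalityType() {
    if (Cache == Personality::Unknown) // classify only on the first query
      Cache = classify(PersonalityName);
    return Cache;
  }
};

int main() {
  ModuleInfo MMI;
  MMI.PersonalityName = "__gxx_personality_v0";
  std::printf("%d\n", static_cast<int>(MMI.getPersonalityType()));
  std::printf("%d\n", static_cast<int>(MMI.getPersonalityType())); // cached
  return 0;
}
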
diff --git a/lib/CodeGen/MachineRegionInfo.cpp b/lib/CodeGen/MachineRegionInfo.cpp
index 5a5035e..01d2c2e 100644
--- a/lib/CodeGen/MachineRegionInfo.cpp
+++ b/lib/CodeGen/MachineRegionInfo.cpp
@@ -1,8 +1,8 @@
 
 #include "llvm/CodeGen/MachineRegionInfo.h"
-#include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/RegionInfoImpl.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
 
 #define DEBUG_TYPE "region"
 
diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
index e9612f3..32b7db1 100644
--- a/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/lib/CodeGen/MachineRegisterInfo.cpp
@@ -24,7 +24,8 @@
 void MachineRegisterInfo::Delegate::anchor() {}
 
 MachineRegisterInfo::MachineRegisterInfo(const MachineFunction *MF)
-  : MF(MF), TheDelegate(nullptr), IsSSA(true), TracksLiveness(true) {
+  : MF(MF), TheDelegate(nullptr), IsSSA(true), TracksLiveness(true),
+    TracksSubRegLiveness(false) {
   VRegInfo.reserve(256);
   RegAllocHints.reserve(256);
   UsedRegUnits.resize(getTargetRegisterInfo()->getNumRegUnits());
@@ -60,8 +61,8 @@
 }
 
 bool
-MachineRegisterInfo::recomputeRegClass(unsigned Reg, const TargetMachine &TM) {
-  const TargetInstrInfo *TII = TM.getSubtargetImpl()->getInstrInfo();
+MachineRegisterInfo::recomputeRegClass(unsigned Reg) {
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
   const TargetRegisterClass *OldRC = getRegClass(Reg);
   const TargetRegisterClass *NewRC =
     getTargetRegisterInfo()->getLargestLegalSuperClass(OldRC);
@@ -128,6 +129,7 @@
              << " use list MachineOperand " << MO
              << " has no parent instruction.\n";
       Valid = false;
+      continue;
     }
     MachineOperand *MO0 = &MI->getOperand(0);
     unsigned NumOps = MI->getNumOperands();
@@ -391,6 +393,14 @@
     }
 }
 
+unsigned MachineRegisterInfo::getMaxLaneMaskForVReg(unsigned Reg) const
+{
+  // Lane masks are only defined for vregs.
+  assert(TargetRegisterInfo::isVirtualRegister(Reg));
+  const TargetRegisterClass &TRC = *getRegClass(Reg);
+  return TRC.getLaneMask();
+}
+
 #ifndef NDEBUG
 void MachineRegisterInfo::dumpUses(unsigned Reg) const {
   for (MachineInstr &I : use_instructions(Reg))
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index 261942f..89ac6a8 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -144,12 +144,12 @@
 
 char &llvm::MachineSchedulerID = MachineScheduler::ID;
 
-INITIALIZE_PASS_BEGIN(MachineScheduler, "misched",
+INITIALIZE_PASS_BEGIN(MachineScheduler, "machine-scheduler",
                       "Machine Instruction Scheduler", false, false)
 INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
 INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_END(MachineScheduler, "misched",
+INITIALIZE_PASS_END(MachineScheduler, "machine-scheduler",
                     "Machine Instruction Scheduler", false, false)
 
 MachineScheduler::MachineScheduler()
@@ -336,9 +336,7 @@
   if (skipOptnoneFunction(*mf.getFunction()))
     return false;
 
-  const TargetSubtargetInfo &ST =
-    mf.getTarget().getSubtarget<TargetSubtargetInfo>();
-  if (!ST.enablePostMachineScheduler()) {
+  if (!mf.getSubtarget().enablePostMachineScheduler()) {
     DEBUG(dbgs() << "Subtarget disables post-MI-sched.\n");
     return false;
   }
@@ -430,9 +428,11 @@
       // instruction stream until we find the nearest boundary.
       unsigned NumRegionInstrs = 0;
       MachineBasicBlock::iterator I = RegionEnd;
-      for(;I != MBB->begin(); --I, --RemainingInstrs, ++NumRegionInstrs) {
+      for(;I != MBB->begin(); --I, --RemainingInstrs) {
         if (isSchedBoundary(std::prev(I), MBB, MF, TII, IsPostRA))
           break;
+        if (!I->isDebugValue())
+          ++NumRegionInstrs;
       }
       // Notify the scheduler of the region, even if we may skip scheduling
       // it. Perhaps it still needs to be bundled.
@@ -1432,12 +1432,15 @@
   // Check if either the dest or source is local. If it's live across a back
   // edge, it's not local. Note that if both vregs are live across the back
   // edge, we cannot successfully constrain the copy without cyclic scheduling.
-  unsigned LocalReg = DstReg;
-  unsigned GlobalReg = SrcReg;
+  // If both the copy's source and dest are local live intervals, then we
+  // should treat the dest as the global for the purpose of adding
+  // constraints. This adds edges from source's other uses to the copy.
+  unsigned LocalReg = SrcReg;
+  unsigned GlobalReg = DstReg;
   LiveInterval *LocalLI = &LIS->getInterval(LocalReg);
   if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) {
-    LocalReg = SrcReg;
-    GlobalReg = DstReg;
+    LocalReg = DstReg;
+    GlobalReg = SrcReg;
     LocalLI = &LIS->getInterval(LocalReg);
     if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx))
       return;
diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp
index ba25bca..8337793 100644
--- a/lib/CodeGen/MachineSink.cpp
+++ b/lib/CodeGen/MachineSink.cpp
@@ -112,7 +112,7 @@
     /// for the lifetime of an iteration.
     ///
     /// \return True if the edge is marked as toSplit, false otherwise.
-    /// False can be retruned if, for instance, this is not profitable.
+    /// False can be returned if, for instance, this is not profitable.
     bool PostponeSplitCriticalEdge(MachineInstr *MI,
                                    MachineBasicBlock *From,
                                    MachineBasicBlock *To,
@@ -504,7 +504,7 @@
   // If SuccToSinkTo post dominates then also it may be profitable if MI
   // can further profitably sinked into another block in next round.
   bool BreakPHIEdge = false;
-  // FIXME - If finding successor is compile time expensive then catch results.
+  // FIXME - If finding successor is compile time expensive then cache results.
   if (MachineBasicBlock *MBB2 = FindSuccToSinkTo(MI, SuccToSinkTo, BreakPHIEdge))
     return isProfitableToSinkTo(Reg, MI, SuccToSinkTo, MBB2);
 
@@ -553,19 +553,6 @@
       if (!TII->isSafeToMoveRegClassDefs(MRI->getRegClass(Reg)))
         return nullptr;
 
-      // FIXME: This picks a successor to sink into based on having one
-      // successor that dominates all the uses.  However, there are cases where
-      // sinking can happen but where the sink point isn't a successor.  For
-      // example:
-      //
-      //   x = computation
-      //   if () {} else {}
-      //   use x
-      //
-      // the instruction could be sunk over the whole diamond for the
-      // if/then/else (or loop, etc), allowing it to be sunk into other blocks
-      // after that.
-
       // Virtual register defs can only be sunk if all their uses are in blocks
       // dominated by one of the successors.
       if (SuccToSinkTo) {
@@ -585,6 +572,23 @@
       // higher priority, otherwise prioritize smaller loop depths.
       SmallVector<MachineBasicBlock*, 4> Succs(MBB->succ_begin(),
                                                MBB->succ_end());
+
+      // Handle cases where sinking can happen but where the sink point isn't a
+      // successor. For example:
+      //
+      //   x = computation
+      //   if () {} else {}
+      //   use x
+      //
+      const std::vector<MachineDomTreeNode *> &Children =
+        DT->getNode(MBB)->getChildren();
+      for (const auto &DTChild : Children)
+        // DomTree children of MBB that have MBB as immediate dominator are added.
+        if (DTChild->getIDom()->getBlock() == MI->getParent() &&
+            // Skip MBBs already added to the Succs vector above.
+            !MBB->isSuccessor(DTChild->getBlock()))
+          Succs.push_back(DTChild->getBlock());
+
       // Sort Successors according to their loop depth or block frequency info.
       std::stable_sort(
           Succs.begin(), Succs.end(),
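
The MachineSink change above widens the candidate set for sinking: besides CFG successors, it also considers dominator-tree children of MBB that are not already successors, which covers the if/else diamond in the comment. A simplified, self-contained sketch of that candidate selection; the block numbers and edges are invented.

#include <cstdio>
#include <set>
#include <utility>
#include <vector>

int main() {
  const int MBB = 0;
  std::set<int> CFGSuccs = {1, 2};                 // then/else blocks
  // (block, immediate dominator): block 3 is the join after the diamond,
  // immediately dominated by MBB but not one of its CFG successors.
  std::vector<std::pair<int, int>> IDom = {{1, 0}, {2, 0}, {3, 0}};

  std::vector<int> Succs(CFGSuccs.begin(), CFGSuccs.end());
  for (const auto &Entry : IDom)
    if (Entry.second == MBB && !CFGSuccs.count(Entry.first))
      Succs.push_back(Entry.first);                // picks up block 3

  for (int S : Succs)
    std::printf("candidate BB#%d\n", S);
  return 0;
}
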
diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp
index 2cf87eb..8aacd1f 100644
--- a/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/lib/CodeGen/MachineTraceMetrics.cpp
@@ -52,12 +52,11 @@
 
 bool MachineTraceMetrics::runOnMachineFunction(MachineFunction &Func) {
   MF = &Func;
-  TII = MF->getSubtarget().getInstrInfo();
-  TRI = MF->getSubtarget().getRegisterInfo();
+  const TargetSubtargetInfo &ST = MF->getSubtarget();
+  TII = ST.getInstrInfo();
+  TRI = ST.getRegisterInfo();
   MRI = &MF->getRegInfo();
   Loops = &getAnalysis<MachineLoopInfo>();
-  const TargetSubtargetInfo &ST =
-    MF->getTarget().getSubtarget<TargetSubtargetInfo>();
   SchedModel.init(ST.getSchedModel(), &ST, TII);
   BlockInfo.resize(MF->getNumBlockIDs());
   ProcResourceCycles.resize(MF->getNumBlockIDs() *
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index 99f0583..bdb094f 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -42,6 +42,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
@@ -54,16 +55,13 @@
 
     MachineVerifier(Pass *pass, const char *b) :
       PASS(pass),
-      Banner(b),
-      OutFileName(getenv("LLVM_VERIFY_MACHINEINSTRS"))
+      Banner(b)
       {}
 
     bool runOnMachineFunction(MachineFunction &MF);
 
     Pass *const PASS;
     const char *Banner;
-    const char *const OutFileName;
-    raw_ostream *OS;
     const MachineFunction *MF;
     const TargetMachine *TM;
     const TargetInstrInfo *TII;
@@ -215,9 +213,9 @@
     void report(const char *msg, const MachineBasicBlock *MBB,
                 const LiveInterval &LI);
     void report(const char *msg, const MachineFunction *MF,
-                const LiveRange &LR, unsigned Reg);
+                const LiveRange &LR, unsigned Reg, unsigned LaneMask);
     void report(const char *msg, const MachineBasicBlock *MBB,
-                const LiveRange &LR, unsigned Reg);
+                const LiveRange &LR, unsigned Reg, unsigned LaneMask);
 
     void verifyInlineAsm(const MachineInstr *MI);
 
@@ -230,20 +228,22 @@
     void verifyLiveVariables();
     void verifyLiveIntervals();
     void verifyLiveInterval(const LiveInterval&);
-    void verifyLiveRangeValue(const LiveRange&, const VNInfo*, unsigned);
+    void verifyLiveRangeValue(const LiveRange&, const VNInfo*, unsigned,
+                              unsigned);
     void verifyLiveRangeSegment(const LiveRange&,
-                                const LiveRange::const_iterator I, unsigned);
-    void verifyLiveRange(const LiveRange&, unsigned);
+                                const LiveRange::const_iterator I, unsigned,
+                                unsigned);
+    void verifyLiveRange(const LiveRange&, unsigned, unsigned LaneMask = 0);
 
     void verifyStackFrame();
   };
 
   struct MachineVerifierPass : public MachineFunctionPass {
     static char ID; // Pass ID, replacement for typeid
-    const char *const Banner;
+    const std::string Banner;
 
-    MachineVerifierPass(const char *b = nullptr)
-      : MachineFunctionPass(ID), Banner(b) {
+    MachineVerifierPass(const std::string &banner = std::string())
+      : MachineFunctionPass(ID), Banner(banner) {
         initializeMachineVerifierPassPass(*PassRegistry::getPassRegistry());
       }
 
@@ -253,7 +253,7 @@
     }
 
     bool runOnMachineFunction(MachineFunction &MF) override {
-      MF.verify(this, Banner);
+      MF.verify(this, Banner.c_str());
       return false;
     }
   };
@@ -264,7 +264,7 @@
 INITIALIZE_PASS(MachineVerifierPass, "machineverifier",
                 "Verify generated machine code", false, false)
 
-FunctionPass *llvm::createMachineVerifierPass(const char *Banner) {
+FunctionPass *llvm::createMachineVerifierPass(const std::string &Banner) {
   return new MachineVerifierPass(Banner);
 }
 
@@ -274,22 +274,6 @@
 }
 
 bool MachineVerifier::runOnMachineFunction(MachineFunction &MF) {
-  raw_ostream *OutFile = nullptr;
-  if (OutFileName) {
-    std::error_code EC;
-    OutFile = new raw_fd_ostream(OutFileName, EC,
-                                 sys::fs::F_Append | sys::fs::F_Text);
-    if (EC) {
-      errs() << "Error opening '" << OutFileName << "': " << EC.message()
-             << '\n';
-      exit(1);
-    }
-
-    OS = OutFile;
-  } else {
-    OS = &errs();
-  }
-
   foundErrors = 0;
 
   this->MF = &MF;
@@ -324,7 +308,7 @@
            MBBE = MFI->instr_end(); MBBI != MBBE; ++MBBI) {
       if (MBBI->getParent() != MFI) {
         report("Bad instruction parent pointer", MFI);
-        *OS << "Instruction: " << *MBBI;
+        errs() << "Instruction: " << *MBBI;
         continue;
       }
 
@@ -360,9 +344,7 @@
   }
   visitMachineFunctionAfter();
 
-  if (OutFile)
-    delete OutFile;
-  else if (foundErrors)
+  if (foundErrors)
     report_fatal_error("Found "+Twine(foundErrors)+" machine code errors.");
 
   // Clean up.
@@ -379,70 +361,76 @@
 
 void MachineVerifier::report(const char *msg, const MachineFunction *MF) {
   assert(MF);
-  *OS << '\n';
+  errs() << '\n';
   if (!foundErrors++) {
     if (Banner)
-      *OS << "# " << Banner << '\n';
-    MF->print(*OS, Indexes);
+      errs() << "# " << Banner << '\n';
+    MF->print(errs(), Indexes);
   }
-  *OS << "*** Bad machine code: " << msg << " ***\n"
+  errs() << "*** Bad machine code: " << msg << " ***\n"
       << "- function:    " << MF->getName() << "\n";
 }
 
 void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB) {
   assert(MBB);
   report(msg, MBB->getParent());
-  *OS << "- basic block: BB#" << MBB->getNumber()
+  errs() << "- basic block: BB#" << MBB->getNumber()
       << ' ' << MBB->getName()
       << " (" << (const void*)MBB << ')';
   if (Indexes)
-    *OS << " [" << Indexes->getMBBStartIdx(MBB)
+    errs() << " [" << Indexes->getMBBStartIdx(MBB)
         << ';' <<  Indexes->getMBBEndIdx(MBB) << ')';
-  *OS << '\n';
+  errs() << '\n';
 }
 
 void MachineVerifier::report(const char *msg, const MachineInstr *MI) {
   assert(MI);
   report(msg, MI->getParent());
-  *OS << "- instruction: ";
+  errs() << "- instruction: ";
   if (Indexes && Indexes->hasIndex(MI))
-    *OS << Indexes->getInstructionIndex(MI) << '\t';
-  MI->print(*OS, TM);
+    errs() << Indexes->getInstructionIndex(MI) << '\t';
+  MI->print(errs(), TM);
 }
 
 void MachineVerifier::report(const char *msg,
                              const MachineOperand *MO, unsigned MONum) {
   assert(MO);
   report(msg, MO->getParent());
-  *OS << "- operand " << MONum << ":   ";
-  MO->print(*OS, TM);
-  *OS << "\n";
+  errs() << "- operand " << MONum << ":   ";
+  MO->print(errs(), TM);
+  errs() << "\n";
 }
 
 void MachineVerifier::report(const char *msg, const MachineFunction *MF,
                              const LiveInterval &LI) {
   report(msg, MF);
-  *OS << "- interval:    " << LI << '\n';
+  errs() << "- interval:    " << LI << '\n';
 }
 
 void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB,
                              const LiveInterval &LI) {
   report(msg, MBB);
-  *OS << "- interval:    " << LI << '\n';
+  errs() << "- interval:    " << LI << '\n';
 }
 
 void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB,
-                             const LiveRange &LR, unsigned Reg) {
+                             const LiveRange &LR, unsigned Reg,
+                             unsigned LaneMask) {
   report(msg, MBB);
-  *OS << "- liverange:   " << LR << '\n';
-  *OS << "- register:    " << PrintReg(Reg, TRI) << '\n';
+  errs() << "- liverange:   " << LR << '\n';
+  errs() << "- register:    " << PrintReg(Reg, TRI) << '\n';
+  if (LaneMask != 0)
+    errs() << "- lanemask:    " << format("%04X\n", LaneMask);
 }
 
 void MachineVerifier::report(const char *msg, const MachineFunction *MF,
-                             const LiveRange &LR, unsigned Reg) {
+                             const LiveRange &LR, unsigned Reg,
+                             unsigned LaneMask) {
   report(msg, MF);
-  *OS << "- liverange:   " << LR << '\n';
-  *OS << "- register:    " << PrintReg(Reg, TRI) << '\n';
+  errs() << "- liverange:   " << LR << '\n';
+  errs() << "- register:    " << PrintReg(Reg, TRI) << '\n';
+  if (LaneMask != 0)
+    errs() << "- lanemask:    " << format("%04X\n", LaneMask);
 }
 
 void MachineVerifier::markReachable(const MachineBasicBlock *MBB) {
@@ -530,7 +518,7 @@
       report("MBB has successor that isn't part of the function.", MBB);
     if (!MBBInfoMap[*I].Preds.count(MBB)) {
       report("Inconsistent CFG", MBB);
-      *OS << "MBB is not in the predecessor list of the successor BB#"
+      errs() << "MBB is not in the predecessor list of the successor BB#"
           << (*I)->getNumber() << ".\n";
     }
   }
@@ -542,7 +530,7 @@
       report("MBB has predecessor that isn't part of the function.", MBB);
     if (!MBBInfoMap[*I].Succs.count(MBB)) {
       report("Inconsistent CFG", MBB);
-      *OS << "MBB is not in the successor list of the predecessor BB#"
+      errs() << "MBB is not in the successor list of the predecessor BB#"
           << (*I)->getNumber() << ".\n";
     }
   }
@@ -592,7 +580,11 @@
       }
     } else if (TBB && !FBB && Cond.empty()) {
       // Block unconditionally branches somewhere.
-      if (MBB->succ_size() != 1+LandingPadSuccs.size()) {
+      // If the block has exactly one successor that happens to be a
+      // landing pad, accept it as valid control flow.
+      if (MBB->succ_size() != 1+LandingPadSuccs.size() &&
+          (MBB->succ_size() != 1 || LandingPadSuccs.size() != 1 ||
+           *MBB->succ_begin() != *LandingPadSuccs.begin())) {
         report("MBB exits via unconditional branch but doesn't have "
                "exactly one CFG successor!", MBB);
       } else if (!MBB->isSuccessor(TBB)) {
@@ -713,7 +705,7 @@
     SlotIndex idx = Indexes->getInstructionIndex(MI);
     if (!(idx > lastIndex)) {
       report("Instruction index out of order", MI);
-      *OS << "Last instruction was at " << lastIndex << '\n';
+      errs() << "Last instruction was at " << lastIndex << '\n';
     }
     lastIndex = idx;
   }
@@ -726,7 +718,7 @@
       FirstTerminator = MI;
   } else if (FirstTerminator) {
     report("Non-terminator instruction after the first terminator", MI);
-    *OS << "First terminator was:\t" << *FirstTerminator;
+    errs() << "First terminator was:\t" << *FirstTerminator;
   }
 }
 
@@ -778,7 +770,7 @@
   const MCInstrDesc &MCID = MI->getDesc();
   if (MI->getNumOperands() < MCID.getNumOperands()) {
     report("Too few operands", MI);
-    *OS << MCID.getNumOperands() << " operands expected, but "
+    errs() << MCID.getNumOperands() << " operands expected, but "
         << MI->getNumOperands() << " given.\n";
   }
 
@@ -908,7 +900,7 @@
               TII->getRegClass(MCID, MONum, TRI, *MF)) {
           if (!DRC->contains(Reg)) {
             report("Illegal physical register for instruction", MO, MONum);
-            *OS << TRI->getName(Reg) << " is not a "
+            errs() << TRI->getName(Reg) << " is not a "
                 << TRI->getRegClassName(DRC) << " register.\n";
           }
         }
@@ -920,13 +912,13 @@
             TRI->getSubClassWithSubReg(RC, SubIdx);
           if (!SRC) {
             report("Invalid subregister index for virtual register", MO, MONum);
-            *OS << "Register class " << TRI->getRegClassName(RC)
+            errs() << "Register class " << TRI->getRegClassName(RC)
                 << " does not support subreg index " << SubIdx << "\n";
             return;
           }
           if (RC != SRC) {
             report("Invalid register class for subregister index", MO, MONum);
-            *OS << "Register class " << TRI->getRegClassName(RC)
+            errs() << "Register class " << TRI->getRegClassName(RC)
                 << " does not fully support subreg index " << SubIdx << "\n";
             return;
           }
@@ -948,7 +940,7 @@
           }
           if (!RC->hasSuperClassEq(DRC)) {
             report("Illegal virtual register for instruction", MO, MONum);
-            *OS << "Expected a " << TRI->getRegClassName(DRC)
+            errs() << "Expected a " << TRI->getRegClassName(DRC)
                 << " register, but got a " << TRI->getRegClassName(RC)
                 << " register\n";
           }
@@ -974,11 +966,11 @@
       SlotIndex Idx = LiveInts->getInstructionIndex(MI);
       if (MI->mayLoad() && !LI.liveAt(Idx.getRegSlot(true))) {
         report("Instruction loads from dead spill slot", MO, MONum);
-        *OS << "Live stack: " << LI << '\n';
+        errs() << "Live stack: " << LI << '\n';
       }
       if (MI->mayStore() && !LI.liveAt(Idx.getRegSlot())) {
         report("Instruction stores to dead spill slot", MO, MONum);
-        *OS << "Live stack: " << LI << '\n';
+        errs() << "Live stack: " << LI << '\n';
       }
     }
     break;
@@ -1017,12 +1009,12 @@
             LiveQueryResult LRQ = LR->Query(UseIdx);
             if (!LRQ.valueIn()) {
               report("No live segment at use", MO, MONum);
-              *OS << UseIdx << " is not live in " << PrintRegUnit(*Units, TRI)
+              errs() << UseIdx << " is not live in " << PrintRegUnit(*Units, TRI)
                   << ' ' << *LR << '\n';
             }
             if (MO->isKill() && !LRQ.isKill()) {
               report("Live range continues after kill flag", MO, MONum);
-              *OS << PrintRegUnit(*Units, TRI) << ' ' << *LR << '\n';
+              errs() << PrintRegUnit(*Units, TRI) << ' ' << *LR << '\n';
             }
           }
         }
@@ -1035,13 +1027,13 @@
           LiveQueryResult LRQ = LI.Query(UseIdx);
           if (!LRQ.valueIn()) {
             report("No live segment at use", MO, MONum);
-            *OS << UseIdx << " is not live in " << LI << '\n';
+            errs() << UseIdx << " is not live in " << LI << '\n';
           }
           // Check for extra kill flags.
           // Note that we allow missing kill flags for now.
           if (MO->isKill() && !LRQ.isKill()) {
             report("Live range continues after kill flag", MO, MONum);
-            *OS << "Live range: " << LI << '\n';
+            errs() << "Live range: " << LI << '\n';
           }
         } else {
           report("Virtual register has no live interval", MO, MONum);
@@ -1053,7 +1045,37 @@
     if (!regsLive.count(Reg)) {
       if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
         // Reserved registers may be used even when 'dead'.
-        if (!isReserved(Reg))
+        bool Bad = !isReserved(Reg);
+        // We are fine if just any subregister has a defined value.
+        if (Bad) {
+          for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid();
+               ++SubRegs) {
+            if (regsLive.count(*SubRegs)) {
+              Bad = false;
+              break;
+            }
+          }
+        }
+        // If there is an additional implicit-use of a super register we stop
+        // here. By definition we are fine if the super register is not
+        // (completely) dead, if the complete super register is dead we will
+        // get a report for its operand.
+        if (Bad) {
+          for (const MachineOperand &MOP : MI->uses()) {
+            if (!MOP.isReg())
+              continue;
+            if (!MOP.isImplicit())
+              continue;
+            for (MCSubRegIterator SubRegs(MOP.getReg(), TRI); SubRegs.isValid();
+                 ++SubRegs) {
+              if (*SubRegs == Reg) {
+                Bad = false;
+                break;
+              }
+            }
+          }
+        }
+        if (Bad)
           report("Using an undefined physical register", MO, MONum);
       } else if (MRI->def_empty(Reg)) {
         report("Reading virtual register without a def", MO, MONum);
@@ -1094,19 +1116,19 @@
           assert(VNI && "NULL valno is not allowed");
           if (VNI->def != DefIdx) {
             report("Inconsistent valno->def", MO, MONum);
-            *OS << "Valno " << VNI->id << " is not defined at "
+            errs() << "Valno " << VNI->id << " is not defined at "
               << DefIdx << " in " << LI << '\n';
           }
         } else {
           report("No live segment at def", MO, MONum);
-          *OS << DefIdx << " is not live in " << LI << '\n';
+          errs() << DefIdx << " is not live in " << LI << '\n';
         }
         // Check that, if the dead def flag is present, LiveInts agree.
         if (MO->isDead()) {
           LiveQueryResult LRQ = LI.Query(DefIdx);
           if (!LRQ.isDeadDef()) {
             report("Live range continues after dead def flag", MO, MONum);
-            *OS << "Live range: " << LI << '\n';
+            errs() << "Live range: " << LI << '\n';
           }
         }
       } else {
@@ -1148,7 +1170,7 @@
     SlotIndex stop = Indexes->getMBBEndIdx(MBB);
     if (!(stop > lastIndex)) {
       report("Block ends before last instruction index", MBB);
-      *OS << "Block ends at " << stop
+      errs() << "Block ends at " << stop
           << " last instruction was at " << lastIndex << '\n';
     }
     lastIndex = stop;
@@ -1250,7 +1272,7 @@
            PrE = MBB->pred_end(); PrI != PrE; ++PrI) {
       if (!seen.count(*PrI)) {
         report("Missing PHI operand", &BBI);
-        *OS << "BB#" << (*PrI)->getNumber()
+        errs() << "BB#" << (*PrI)->getNumber()
             << " is a predecessor according to the CFG.\n";
       }
     }
@@ -1281,7 +1303,7 @@
          ++I)
       if (MInfo.regsKilled.count(*I)) {
         report("Virtual register killed in block, but needed live out.", &MBB);
-        *OS << "Virtual register " << PrintReg(*I)
+        errs() << "Virtual register " << PrintReg(*I)
             << " is used after the block.\n";
       }
   }
@@ -1313,13 +1335,13 @@
       if (MInfo.vregsRequired.count(Reg)) {
         if (!VI.AliveBlocks.test(MBB.getNumber())) {
           report("LiveVariables: Block missing from AliveBlocks", &MBB);
-          *OS << "Virtual register " << PrintReg(Reg)
+          errs() << "Virtual register " << PrintReg(Reg)
               << " must be live through the block.\n";
         }
       } else {
         if (VI.AliveBlocks.test(MBB.getNumber())) {
           report("LiveVariables: Block should not be in AliveBlocks", &MBB);
-          *OS << "Virtual register " << PrintReg(Reg)
+          errs() << "Virtual register " << PrintReg(Reg)
               << " is not needed live through the block.\n";
         }
       }
@@ -1338,7 +1360,7 @@
 
     if (!LiveInts->hasInterval(Reg)) {
       report("Missing live interval for virtual register", MF);
-      *OS << PrintReg(Reg, TRI) << " still has defs or uses\n";
+      errs() << PrintReg(Reg, TRI) << " still has defs or uses\n";
       continue;
     }
 
@@ -1354,38 +1376,40 @@
 }
 
 void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR,
-                                           const VNInfo *VNI,
-                                           unsigned Reg) {
+                                           const VNInfo *VNI, unsigned Reg,
+                                           unsigned LaneMask) {
   if (VNI->isUnused())
     return;
 
   const VNInfo *DefVNI = LR.getVNInfoAt(VNI->def);
 
   if (!DefVNI) {
-    report("Valno not live at def and not marked unused", MF, LR, Reg);
-    *OS << "Valno #" << VNI->id << '\n';
+    report("Valno not live at def and not marked unused", MF, LR, Reg,
+           LaneMask);
+    errs() << "Valno #" << VNI->id << '\n';
     return;
   }
 
   if (DefVNI != VNI) {
-    report("Live segment at def has different valno", MF, LR, Reg);
-    *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
+    report("Live segment at def has different valno", MF, LR, Reg, LaneMask);
+    errs() << "Valno #" << VNI->id << " is defined at " << VNI->def
         << " where valno #" << DefVNI->id << " is live\n";
     return;
   }
 
   const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(VNI->def);
   if (!MBB) {
-    report("Invalid definition index", MF, LR, Reg);
-    *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
+    report("Invalid definition index", MF, LR, Reg, LaneMask);
+    errs() << "Valno #" << VNI->id << " is defined at " << VNI->def
         << " in " << LR << '\n';
     return;
   }
 
   if (VNI->isPHIDef()) {
     if (VNI->def != LiveInts->getMBBStartIdx(MBB)) {
-      report("PHIDef value is not defined at MBB start", MBB, LR, Reg);
-      *OS << "Valno #" << VNI->id << " is defined at " << VNI->def
+      report("PHIDef value is not defined at MBB start", MBB, LR, Reg,
+             LaneMask);
+      errs() << "Valno #" << VNI->id << " is defined at " << VNI->def
           << ", not at the beginning of BB#" << MBB->getNumber() << '\n';
     }
     return;
@@ -1394,8 +1418,8 @@
   // Non-PHI def.
   const MachineInstr *MI = LiveInts->getInstructionFromIndex(VNI->def);
   if (!MI) {
-    report("No instruction at def index", MBB, LR, Reg);
-    *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
+    report("No instruction at def index", MBB, LR, Reg, LaneMask);
+    errs() << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
     return;
   }
 
@@ -1413,6 +1437,9 @@
             !TRI->hasRegUnit(MOI->getReg(), Reg))
           continue;
       }
+      if (LaneMask != 0 &&
+          (TRI->getSubRegIndexLaneMask(MOI->getSubReg()) & LaneMask) == 0)
+        continue;
       hasDef = true;
       if (MOI->isEarlyClobber())
         isEarlyClobber = true;
@@ -1420,7 +1447,7 @@
 
     if (!hasDef) {
       report("Defining instruction does not modify register", MI);
-      *OS << "Valno #" << VNI->id << " in " << LR << '\n';
+      errs() << "Valno #" << VNI->id << " in " << LR << '\n';
     }
 
     // Early clobber defs begin at USE slots, but other defs must begin at
@@ -1428,51 +1455,52 @@
     if (isEarlyClobber) {
       if (!VNI->def.isEarlyClobber()) {
         report("Early clobber def must be at an early-clobber slot", MBB, LR,
-               Reg);
-        *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
+               Reg, LaneMask);
+        errs() << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
       }
     } else if (!VNI->def.isRegister()) {
       report("Non-PHI, non-early clobber def must be at a register slot",
-             MBB, LR, Reg);
-      *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
+             MBB, LR, Reg, LaneMask);
+      errs() << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n';
     }
   }
 }
 
 void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR,
                                              const LiveRange::const_iterator I,
-                                             unsigned Reg) {
+                                             unsigned Reg, unsigned LaneMask) {
   const LiveRange::Segment &S = *I;
   const VNInfo *VNI = S.valno;
   assert(VNI && "Live segment has no valno");
 
   if (VNI->id >= LR.getNumValNums() || VNI != LR.getValNumInfo(VNI->id)) {
-    report("Foreign valno in live segment", MF, LR, Reg);
-    *OS << S << " has a bad valno\n";
+    report("Foreign valno in live segment", MF, LR, Reg, LaneMask);
+    errs() << S << " has a bad valno\n";
   }
 
   if (VNI->isUnused()) {
-    report("Live segment valno is marked unused", MF, LR, Reg);
-    *OS << S << '\n';
+    report("Live segment valno is marked unused", MF, LR, Reg, LaneMask);
+    errs() << S << '\n';
   }
 
   const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(S.start);
   if (!MBB) {
-    report("Bad start of live segment, no basic block", MF, LR, Reg);
-    *OS << S << '\n';
+    report("Bad start of live segment, no basic block", MF, LR, Reg, LaneMask);
+    errs() << S << '\n';
     return;
   }
   SlotIndex MBBStartIdx = LiveInts->getMBBStartIdx(MBB);
   if (S.start != MBBStartIdx && S.start != VNI->def) {
-    report("Live segment must begin at MBB entry or valno def", MBB, LR, Reg);
-    *OS << S << '\n';
+    report("Live segment must begin at MBB entry or valno def", MBB, LR, Reg,
+           LaneMask);
+    errs() << S << '\n';
   }
 
   const MachineBasicBlock *EndMBB =
     LiveInts->getMBBFromIndex(S.end.getPrevSlot());
   if (!EndMBB) {
-    report("Bad end of live segment, no basic block", MF, LR, Reg);
-    *OS << S << '\n';
+    report("Bad end of live segment, no basic block", MF, LR, Reg, LaneMask);
+    errs() << S << '\n';
     return;
   }
 
@@ -1489,15 +1517,17 @@
   const MachineInstr *MI =
     LiveInts->getInstructionFromIndex(S.end.getPrevSlot());
   if (!MI) {
-    report("Live segment doesn't end at a valid instruction", EndMBB, LR, Reg);
-    *OS << S << '\n';
+    report("Live segment doesn't end at a valid instruction", EndMBB, LR, Reg,
+           LaneMask);
+    errs() << S << '\n';
     return;
   }
 
   // The block slot must refer to a basic block boundary.
   if (S.end.isBlock()) {
-    report("Live segment ends at B slot of an instruction", EndMBB, LR, Reg);
-    *OS << S << '\n';
+    report("Live segment ends at B slot of an instruction", EndMBB, LR, Reg,
+           LaneMask);
+    errs() << S << '\n';
   }
 
   if (S.end.isDead()) {
@@ -1505,8 +1535,8 @@
     // That means there must be a dead def.
     if (!SlotIndex::isSameInstr(S.start, S.end)) {
       report("Live segment ending at dead slot spans instructions", EndMBB, LR,
-             Reg);
-      *OS << S << '\n';
+             Reg, LaneMask);
+      errs() << S << '\n';
     }
   }
 
@@ -1515,8 +1545,9 @@
   if (S.end.isEarlyClobber()) {
     if (I+1 == LR.end() || (I+1)->start != S.end) {
       report("Live segment ending at early clobber slot must be "
-             "redefined by an EC def in the same instruction", EndMBB, LR, Reg);
-      *OS << S << '\n';
+             "redefined by an EC def in the same instruction", EndMBB, LR, Reg,
+             LaneMask);
+      errs() << S << '\n';
     }
   }
 
@@ -1526,16 +1557,27 @@
     // A live segment can end with either a redefinition, a kill flag on a
     // use, or a dead flag on a def.
     bool hasRead = false;
+    bool hasSubRegDef = false;
     for (ConstMIBundleOperands MOI(MI); MOI.isValid(); ++MOI) {
       if (!MOI->isReg() || MOI->getReg() != Reg)
         continue;
+      if (LaneMask != 0 &&
+          (LaneMask & TRI->getSubRegIndexLaneMask(MOI->getSubReg())) == 0)
+        continue;
+      if (MOI->isDef() && MOI->getSubReg() != 0)
+        hasSubRegDef = true;
       if (MOI->readsReg())
         hasRead = true;
     }
     if (!S.end.isDead()) {
       if (!hasRead) {
-        report("Instruction ending live segment doesn't read the register", MI);
-        *OS << S << " in " << LR << '\n';
+        // When tracking subregister liveness, the main range must start new
+        // values on partial register writes, even if there is no read.
+        if (!MRI->tracksSubRegLiveness() || LaneMask != 0 || !hasSubRegDef) {
+          report("Instruction ending live segment doesn't read the register",
+                 MI);
+          errs() << S << " in " << LR << '\n';
+        }
       }
     }
   }
@@ -1573,8 +1615,9 @@
 
       // All predecessors must have a live-out value.
       if (!PVNI) {
-        report("Register not marked live out of predecessor", *PI, LR, Reg);
-        *OS << "Valno #" << VNI->id << " live into BB#" << MFI->getNumber()
+        report("Register not marked live out of predecessor", *PI, LR, Reg,
+               LaneMask);
+        errs() << "Valno #" << VNI->id << " live into BB#" << MFI->getNumber()
             << '@' << LiveInts->getMBBStartIdx(MFI) << ", not live before "
             << PEnd << '\n';
         continue;
@@ -1582,8 +1625,9 @@
 
       // Only PHI-defs can take different predecessor values.
       if (!IsPHI && PVNI != VNI) {
-        report("Different value live out of predecessor", *PI, LR, Reg);
-        *OS << "Valno #" << PVNI->id << " live out of BB#"
+        report("Different value live out of predecessor", *PI, LR, Reg,
+               LaneMask);
+        errs() << "Valno #" << PVNI->id << " live out of BB#"
             << (*PI)->getNumber() << '@' << PEnd
             << "\nValno #" << VNI->id << " live into BB#" << MFI->getNumber()
             << '@' << LiveInts->getMBBStartIdx(MFI) << '\n';
@@ -1595,18 +1639,36 @@
   }
 }
 
-void MachineVerifier::verifyLiveRange(const LiveRange &LR, unsigned Reg) {
-  for (LiveRange::const_vni_iterator I = LR.vni_begin(), E = LR.vni_end();
-       I != E; ++I)
-    verifyLiveRangeValue(LR, *I, Reg);
+void MachineVerifier::verifyLiveRange(const LiveRange &LR, unsigned Reg,
+                                      unsigned LaneMask) {
+  for (const VNInfo *VNI : LR.valnos)
+    verifyLiveRangeValue(LR, VNI, Reg, LaneMask);
 
   for (LiveRange::const_iterator I = LR.begin(), E = LR.end(); I != E; ++I)
-    verifyLiveRangeSegment(LR, I, Reg);
+    verifyLiveRangeSegment(LR, I, Reg, LaneMask);
 }
 
 void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) {
   verifyLiveRange(LI, LI.reg);
 
+  unsigned Reg = LI.reg;
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    unsigned Mask = 0;
+    unsigned MaxMask = MRI->getMaxLaneMaskForVReg(Reg);
+    for (const LiveInterval::SubRange &SR : LI.subranges()) {
+      if ((Mask & SR.LaneMask) != 0)
+        report("Lane masks of sub ranges overlap in live interval", MF, LI);
+      if ((SR.LaneMask & ~MaxMask) != 0)
+        report("Subrange lanemask is invalid", MF, LI);
+      Mask |= SR.LaneMask;
+      verifyLiveRange(SR, LI.reg, SR.LaneMask);
+      if (!LI.covers(SR))
+        report("A Subrange is not covered by the main range", MF, LI);
+    }
+  } else if (LI.hasSubRanges()) {
+    report("subregister liveness only allowed for virtual registers", MF, LI);
+  }
+
   // Check the LI only has one connected component.
   if (TargetRegisterInfo::isVirtualRegister(LI.reg)) {
     ConnectedVNInfoEqClasses ConEQ(*LiveInts);
@@ -1614,12 +1676,12 @@
     if (NumComp > 1) {
       report("Multiple connected components in live interval", MF, LI);
       for (unsigned comp = 0; comp != NumComp; ++comp) {
-        *OS << comp << ": valnos";
+        errs() << comp << ": valnos";
         for (LiveInterval::const_vni_iterator I = LI.vni_begin(),
              E = LI.vni_end(); I!=E; ++I)
           if (comp == ConEQ.getEqClass(*I))
-            *OS << ' ' << (*I)->id;
-        *OS << '\n';
+            errs() << ' ' << (*I)->id;
+        errs() << '\n';
       }
     }
   }
@@ -1700,7 +1762,7 @@
                                                BBState.ExitValue;
         if (BBState.ExitIsSetup && AbsSPAdj != Size) {
           report("FrameDestroy <n> is after FrameSetup <m>", &I);
-          *OS << "FrameDestroy <" << Size << "> is after FrameSetup <"
+          errs() << "FrameDestroy <" << Size << "> is after FrameSetup <"
               << AbsSPAdj << ">.\n";
         }
         BBState.ExitValue += Size;
@@ -1717,7 +1779,7 @@
           (SPState[(*I)->getNumber()].ExitValue != BBState.EntryValue ||
            SPState[(*I)->getNumber()].ExitIsSetup != BBState.EntryIsSetup)) {
         report("The exit stack state of a predecessor is inconsistent.", MBB);
-        *OS << "Predecessor BB#" << (*I)->getNumber() << " has exit state ("
+        errs() << "Predecessor BB#" << (*I)->getNumber() << " has exit state ("
             << SPState[(*I)->getNumber()].ExitValue << ", "
             << SPState[(*I)->getNumber()].ExitIsSetup
             << "), while BB#" << MBB->getNumber() << " has entry state ("
@@ -1733,7 +1795,7 @@
           (SPState[(*I)->getNumber()].EntryValue != BBState.ExitValue ||
            SPState[(*I)->getNumber()].EntryIsSetup != BBState.ExitIsSetup)) {
         report("The entry stack state of a successor is inconsistent.", MBB);
-        *OS << "Successor BB#" << (*I)->getNumber() << " has entry state ("
+        errs() << "Successor BB#" << (*I)->getNumber() << " has entry state ("
             << SPState[(*I)->getNumber()].EntryValue << ", "
             << SPState[(*I)->getNumber()].EntryIsSetup
             << "), while BB#" << MBB->getNumber() << " has exit state ("
diff --git a/lib/CodeGen/OcamlGC.cpp b/lib/CodeGen/OcamlGC.cpp
index 48db200..17654a6 100644
--- a/lib/CodeGen/OcamlGC.cpp
+++ b/lib/CodeGen/OcamlGC.cpp
@@ -20,16 +20,15 @@
 using namespace llvm;
 
 namespace {
-  class OcamlGC : public GCStrategy {
-  public:
-    OcamlGC();
-  };
+class OcamlGC : public GCStrategy {
+public:
+  OcamlGC();
+};
 }
 
-static GCRegistry::Add<OcamlGC>
-X("ocaml", "ocaml 3.10-compatible GC");
+static GCRegistry::Add<OcamlGC> X("ocaml", "ocaml 3.10-compatible GC");
 
-void llvm::linkOcamlGC() { }
+void llvm::linkOcamlGC() {}
 
 OcamlGC::OcamlGC() {
   NeededSafePoints = 1 << GC::PostCall;
diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp
index ec71d86..272d068 100644
--- a/lib/CodeGen/Passes.cpp
+++ b/lib/CodeGen/Passes.cpp
@@ -14,13 +14,12 @@
 
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/Analysis/Passes.h"
-#include "llvm/CodeGen/GCStrategy.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/RegAllocRegistry.h"
 #include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/MC/MCAsmInfo.h"
-#include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -82,7 +81,9 @@
     cl::desc("Dump garbage collector data"));
 static cl::opt<bool> VerifyMachineCode("verify-machineinstrs", cl::Hidden,
     cl::desc("Verify generated machine code"),
-    cl::init(getenv("LLVM_VERIFY_MACHINEINSTRS")!=nullptr));
+    cl::init(false),
+    cl::ZeroOrMore);
+
 static cl::opt<std::string>
 PrintMachineInstrs("print-machineinstrs", cl::ValueOptional,
                    cl::desc("Print machine instrs"),
@@ -235,8 +236,8 @@
 // registers all common codegen passes.
 TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm)
   : ImmutablePass(ID), PM(&pm), StartAfter(nullptr), StopAfter(nullptr),
-    Started(true), Stopped(false), TM(tm), Impl(nullptr), Initialized(false),
-    DisableVerify(false),
+    Started(true), Stopped(false), AddingMachinePasses(false), TM(tm),
+    Impl(nullptr), Initialized(false), DisableVerify(false),
     EnableTailMerge(true) {
 
   Impl = new PassConfigImpl();
@@ -250,7 +251,7 @@
   substitutePass(&PostRAMachineLICMID, &MachineLICMID);
 
   // Temporarily disable experimental passes.
-  const TargetSubtargetInfo &ST = TM->getSubtarget<TargetSubtargetInfo>();
+  const TargetSubtargetInfo &ST = *TM->getSubtargetImpl();
   if (!ST.useMachineScheduler())
     disablePass(&MachineSchedulerID);
 }
@@ -304,7 +305,7 @@
 /// a later pass or that it should stop after an earlier pass, then do not add
 /// the pass.  Finally, compare the current pass against the StartAfter
 /// and StopAfter options and change the Started/Stopped flags accordingly.
-void TargetPassConfig::addPass(Pass *P) {
+void TargetPassConfig::addPass(Pass *P, bool verifyAfter, bool printAfter) {
   assert(!Initialized && "PassConfig is immutable");
 
   // Cache the Pass ID here in case the pass manager finds this pass is
@@ -313,10 +314,21 @@
   // and shouldn't reference it.
   AnalysisID PassID = P->getPassID();
 
-  if (Started && !Stopped)
+  if (Started && !Stopped) {
+    std::string Banner;
+    // Construct banner message before PM->add() as that may delete the pass.
+    if (AddingMachinePasses && (printAfter || verifyAfter))
+      Banner = std::string("After ") + std::string(P->getPassName());
     PM->add(P);
-  else
+    if (AddingMachinePasses) {
+      if (printAfter)
+        addPrintPass(Banner);
+      if (verifyAfter)
+        addVerifyPass(Banner);
+    }
+  } else {
     delete P;
+  }
   if (StopAfter == PassID)
     Stopped = true;
   if (StartAfter == PassID)
@@ -330,7 +342,8 @@
 ///
 /// addPass cannot return a pointer to the pass instance because is internal the
 /// PassManager and the instance we create here may already be freed.
-AnalysisID TargetPassConfig::addPass(AnalysisID PassID) {
+AnalysisID TargetPassConfig::addPass(AnalysisID PassID, bool verifyAfter,
+                                     bool printAfter) {
   IdentifyingPassPtr TargetID = getPassSubstitution(PassID);
   IdentifyingPassPtr FinalPtr = overridePass(PassID, TargetID);
   if (!FinalPtr.isValid())
@@ -345,7 +358,7 @@
       llvm_unreachable("Pass ID not registered");
   }
   AnalysisID FinalID = P->getPassID();
-  addPass(P); // Ends the lifetime of P.
+  addPass(P, verifyAfter, printAfter); // Ends the lifetime of P.
 
   // Add the passes after the pass P if there is any.
   for (SmallVectorImpl<std::pair<AnalysisID, IdentifyingPassPtr> >::iterator
@@ -360,18 +373,25 @@
         NP = Pass::createPass((*I).second.getID());
         assert(NP && "Pass ID not registered");
       }
-      addPass(NP);
+      addPass(NP, false, false);
     }
   }
   return FinalID;
 }
 
-void TargetPassConfig::printAndVerify(const char *Banner) {
-  if (TM->shouldPrintMachineCode())
-    addPass(createMachineFunctionPrinterPass(dbgs(), Banner));
+void TargetPassConfig::printAndVerify(const std::string &Banner) {
+  addPrintPass(Banner);
+  addVerifyPass(Banner);
+}
 
+void TargetPassConfig::addPrintPass(const std::string &Banner) {
+  if (TM->shouldPrintMachineCode())
+    PM->add(createMachineFunctionPrinterPass(dbgs(), Banner));
+}
+
+void TargetPassConfig::addVerifyPass(const std::string &Banner) {
   if (VerifyMachineCode)
-    addPass(createMachineVerifierPass(Banner));
+    PM->add(createMachineVerifierPass(Banner));
 }
 
 /// Add common target configurable passes that perform LLVM IR to IR transforms
@@ -401,7 +421,10 @@
       addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n"));
   }
 
+  // Run GC lowering passes for builtin collectors
+  // TODO: add a pass insertion point here
   addPass(createGCLoweringPass());
+  addPass(createShadowStackGCLoweringPass());
 
   // Make sure that no unreachable blocks are instruction selected.
   addPass(createUnreachableBlockEliminationPass());
@@ -429,9 +452,11 @@
     // FALLTHROUGH
   case ExceptionHandling::DwarfCFI:
   case ExceptionHandling::ARM:
-  case ExceptionHandling::ItaniumWinEH:
     addPass(createDwarfEHPass(TM));
     break;
+  case ExceptionHandling::WinEH:
+    addPass(createWinEHPass(TM));
+    break;
   case ExceptionHandling::None:
     addPass(createLowerInvokePass());
 
@@ -491,6 +516,8 @@
 /// TODO: We could use a single addPre/Post(ID) hook to allow pass injection
 /// before/after any target-independent pass. But it's currently overkill.
 void TargetPassConfig::addMachinePasses() {
+  AddingMachinePasses = true;
+
   // Insert a machine instr printer pass after the specified pass.
   // If -print-machineinstrs specified, print machineinstrs after all passes.
   if (StringRef(PrintMachineInstrs.getValue()).equals(""))
@@ -499,7 +526,7 @@
            .equals("option-unspecified")) {
     const PassRegistry *PR = PassRegistry::getPassRegistry();
     const PassInfo *TPI = PR->getPassInfo(PrintMachineInstrs.getValue());
-    const PassInfo *IPI = PR->getPassInfo(StringRef("print-machineinstrs"));
+    const PassInfo *IPI = PR->getPassInfo(StringRef("machineinstr-printer"));
     assert (TPI && IPI && "Pass ID not registered!");
     const char *TID = (const char *)(TPI->getTypeInfo());
     const char *IID = (const char *)(IPI->getTypeInfo());
@@ -510,8 +537,7 @@
   printAndVerify("After Instruction Selection");
 
   // Expand pseudo-instructions emitted by ISel.
-  if (addPass(&ExpandISelPseudosID))
-    printAndVerify("After ExpandISelPseudos");
+  addPass(&ExpandISelPseudosID);
 
   // Add passes that optimize machine instructions in SSA form.
   if (getOptLevel() != CodeGenOpt::None) {
@@ -519,12 +545,11 @@
   } else {
     // If the target requests it, assign local variables to stack slots relative
     // to one another and simplify frame index references where possible.
-    addPass(&LocalStackSlotAllocationID);
+    addPass(&LocalStackSlotAllocationID, false);
   }
 
   // Run pre-ra passes.
-  if (addPreRegAlloc())
-    printAndVerify("After PreRegAlloc passes");
+  addPreRegAlloc();
 
   // Run register allocation and passes that are tightly coupled with it,
   // including phi elimination and scheduling.
@@ -534,12 +559,10 @@
     addFastRegAlloc(createRegAllocPass(false));
 
   // Run post-ra passes.
-  if (addPostRegAlloc())
-    printAndVerify("After PostRegAlloc passes");
+  addPostRegAlloc();
 
   // Insert prolog/epilog code.  Eliminate abstract frame index references...
   addPass(&PrologEpilogCodeInserterID);
-  printAndVerify("After PrologEpilogCodeInserter");
 
   /// Add passes that optimize machine instructions after register allocation.
   if (getOptLevel() != CodeGenOpt::None)
@@ -547,11 +570,9 @@
 
   // Expand pseudo instructions before second scheduling pass.
   addPass(&ExpandPostRAPseudosID);
-  printAndVerify("After ExpandPostRAPseudos");
 
   // Run pre-sched2 passes.
-  if (addPreSched2())
-    printAndVerify("After PreSched2 passes");
+  addPreSched2();
 
   // Second pass scheduler.
   if (getOptLevel() != CodeGenOpt::None) {
@@ -559,66 +580,61 @@
       addPass(&PostMachineSchedulerID);
     else
       addPass(&PostRASchedulerID);
-    printAndVerify("After PostRAScheduler");
   }
 
   // GC
   if (addGCPasses()) {
     if (PrintGCInfo)
-      addPass(createGCInfoPrinter(dbgs()));
+      addPass(createGCInfoPrinter(dbgs()), false, false);
   }
 
   // Basic block placement.
   if (getOptLevel() != CodeGenOpt::None)
     addBlockPlacement();
 
-  if (addPreEmitPass())
-    printAndVerify("After PreEmit passes");
+  addPreEmitPass();
 
-  addPass(&StackMapLivenessID);
+  addPass(&StackMapLivenessID, false);
+
+  AddingMachinePasses = false;
 }
 
 /// Add passes that optimize machine instructions in SSA form.
 void TargetPassConfig::addMachineSSAOptimization() {
   // Pre-ra tail duplication.
-  if (addPass(&EarlyTailDuplicateID))
-    printAndVerify("After Pre-RegAlloc TailDuplicate");
+  addPass(&EarlyTailDuplicateID);
 
   // Optimize PHIs before DCE: removing dead PHI cycles may make more
   // instructions dead.
-  addPass(&OptimizePHIsID);
+  addPass(&OptimizePHIsID, false);
 
   // This pass merges large allocas. StackSlotColoring is a different pass
   // which merges spill slots.
-  addPass(&StackColoringID);
+  addPass(&StackColoringID, false);
 
   // If the target requests it, assign local variables to stack slots relative
   // to one another and simplify frame index references where possible.
-  addPass(&LocalStackSlotAllocationID);
+  addPass(&LocalStackSlotAllocationID, false);
 
   // With optimization, dead code should already be eliminated. However
   // there is one known exception: lowered code for arguments that are only
   // used by tail calls, where the tail calls reuse the incoming stack
   // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
   addPass(&DeadMachineInstructionElimID);
-  printAndVerify("After codegen DCE pass");
 
   // Allow targets to insert passes that improve instruction level parallelism,
   // like if-conversion. Such passes will typically need dominator trees and
   // loop info, just like LICM and CSE below.
-  if (addILPOpts())
-    printAndVerify("After ILP optimizations");
+  addILPOpts();
 
-  addPass(&MachineLICMID);
-  addPass(&MachineCSEID);
+  addPass(&MachineLICMID, false);
+  addPass(&MachineCSEID, false);
   addPass(&MachineSinkingID);
-  printAndVerify("After Machine LICM, CSE and Sinking passes");
 
-  addPass(&PeepholeOptimizerID);
+  addPass(&PeepholeOptimizerID, false);
   // Clean-up the dead code that may have been generated by peephole
   // rewriting.
   addPass(&DeadMachineInstructionElimID);
-  printAndVerify("After codegen peephole optimization pass");
 }
 
 //===---------------------------------------------------------------------===//
@@ -701,18 +717,17 @@
 /// Add the minimum set of target-independent passes that are required for
 /// register allocation. No coalescing or scheduling.
 void TargetPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
-  addPass(&PHIEliminationID);
-  addPass(&TwoAddressInstructionPassID);
+  addPass(&PHIEliminationID, false);
+  addPass(&TwoAddressInstructionPassID, false);
 
   addPass(RegAllocPass);
-  printAndVerify("After Register Allocation");
 }
 
 /// Add standard target-independent passes that are tightly coupled with
 /// optimized register allocation, including coalescing, machine instruction
 /// scheduling, and register allocation itself.
 void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
-  addPass(&ProcessImplicitDefsID);
+  addPass(&ProcessImplicitDefsID, false);
 
   // LiveVariables currently requires pure SSA form.
   //
@@ -720,35 +735,30 @@
   // LiveVariables can be removed completely, and LiveIntervals can be directly
   // computed. (We still either need to regenerate kill flags after regalloc, or
   // preferably fix the scavenger to not depend on them).
-  addPass(&LiveVariablesID);
+  addPass(&LiveVariablesID, false);
 
   // Edge splitting is smarter with machine loop info.
-  addPass(&MachineLoopInfoID);
-  addPass(&PHIEliminationID);
+  addPass(&MachineLoopInfoID, false);
+  addPass(&PHIEliminationID, false);
 
   // Eventually, we want to run LiveIntervals before PHI elimination.
   if (EarlyLiveIntervals)
-    addPass(&LiveIntervalsID);
+    addPass(&LiveIntervalsID, false);
 
-  addPass(&TwoAddressInstructionPassID);
+  addPass(&TwoAddressInstructionPassID, false);
   addPass(&RegisterCoalescerID);
-  printAndVerify("After Register Coalescing");
 
   // PreRA instruction scheduling.
-  if (addPass(&MachineSchedulerID))
-    printAndVerify("After Machine Scheduling");
+  addPass(&MachineSchedulerID);
 
   // Add the selected register allocation pass.
   addPass(RegAllocPass);
-  printAndVerify("After Register Allocation, before rewriter");
 
   // Allow targets to change the register assignments before rewriting.
-  if (addPreRewrite())
-    printAndVerify("After pre-rewrite passes");
+  addPreRewrite();
 
   // Finally rewrite virtual registers.
   addPass(&VirtRegRewriterID);
-  printAndVerify("After Virtual Register Rewriter");
 
   // Perform stack slot coloring and post-ra machine LICM.
   //
@@ -760,8 +770,6 @@
   //
   // FIXME: can this move into MachineLateOptimization?
   addPass(&PostRAMachineLICMID);
-
-  printAndVerify("After StackSlotColoring and postra Machine LICM");
 }
 
 //===---------------------------------------------------------------------===//
@@ -771,34 +779,30 @@
 /// Add passes that optimize machine instructions after register allocation.
 void TargetPassConfig::addMachineLateOptimization() {
   // Branch folding must be run after regalloc and prolog/epilog insertion.
-  if (addPass(&BranchFolderPassID))
-    printAndVerify("After BranchFolding");
+  addPass(&BranchFolderPassID);
 
   // Tail duplication.
   // Note that duplicating tail just increases code size and degrades
   // performance for targets that require Structured Control Flow.
   // In addition it can also make CFG irreducible. Thus we disable it.
-  if (!TM->requiresStructuredCFG() && addPass(&TailDuplicateID))
-    printAndVerify("After TailDuplicate");
+  if (!TM->requiresStructuredCFG())
+    addPass(&TailDuplicateID);
 
   // Copy propagation.
-  if (addPass(&MachineCopyPropagationID))
-    printAndVerify("After copy propagation pass");
+  addPass(&MachineCopyPropagationID);
 }
 
 /// Add standard GC passes.
 bool TargetPassConfig::addGCPasses() {
-  addPass(&GCMachineCodeAnalysisID);
+  addPass(&GCMachineCodeAnalysisID, false);
   return true;
 }
 
 /// Add standard basic block placement passes.
 void TargetPassConfig::addBlockPlacement() {
-  if (addPass(&MachineBlockPlacementID)) {
+  if (addPass(&MachineBlockPlacementID, false)) {
     // Run a separate pass to collect block placement statistics.
     if (EnableBlockPlacementStats)
       addPass(&MachineBlockPlacementStatsID);
-
-    printAndVerify("After machine block placement.");
   }
 }
diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp
index a296aea..283d1f2 100644
--- a/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/lib/CodeGen/PeepholeOptimizer.cpp
@@ -133,7 +133,8 @@
     bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB);
     bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
                           SmallPtrSetImpl<MachineInstr*> &LocalMIs);
-    bool optimizeSelect(MachineInstr *MI);
+    bool optimizeSelect(MachineInstr *MI,
+                        SmallPtrSetImpl<MachineInstr *> &LocalMIs);
     bool optimizeCondBranch(MachineInstr *MI);
     bool optimizeCopyOrBitcast(MachineInstr *MI);
     bool optimizeCoalescableCopy(MachineInstr *MI);
@@ -482,7 +483,8 @@
 }
 
 /// Optimize a select instruction.
-bool PeepholeOptimizer::optimizeSelect(MachineInstr *MI) {
+bool PeepholeOptimizer::optimizeSelect(MachineInstr *MI,
+                            SmallPtrSetImpl<MachineInstr *> &LocalMIs) {
   unsigned TrueOp = 0;
   unsigned FalseOp = 0;
   bool Optimizable = false;
@@ -491,7 +493,7 @@
     return false;
   if (!Optimizable)
     return false;
-  if (!TII->optimizeSelect(MI))
+  if (!TII->optimizeSelect(MI, LocalMIs))
     return false;
   MI->eraseFromParent();
   ++NumSelects;
@@ -1072,6 +1074,13 @@
     MachineBasicBlock *MBB = &*I;
 
     bool SeenMoveImm = false;
+
+    // During this forward scan, at some point it needs to answer the question
+    // "given a pointer to an MI in the current BB, is it located before or
+    // after the current instruction".
+    // To answer this, the following set keeps track of the MIs already seen
+    // during the scan; if an MI is not in the set, it is assumed to be
+    // located after. Newly created MIs have to be inserted in the set as well.
     SmallPtrSet<MachineInstr*, 16> LocalMIs;
     SmallSet<unsigned, 4> ImmDefRegs;
     DenseMap<unsigned, MachineInstr*> ImmDefMIs;
@@ -1102,7 +1111,7 @@
       if ((isUncoalescableCopy(*MI) &&
            optimizeUncoalescableCopy(MI, LocalMIs)) ||
           (MI->isCompare() && optimizeCmpInstr(MI, MBB)) ||
-          (MI->isSelect() && optimizeSelect(MI))) {
+          (MI->isSelect() && optimizeSelect(MI, LocalMIs))) {
         // MI is deleted.
         LocalMIs.erase(MI);
         Changed = true;
diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp
index 89e1d11..ad59fc9 100644
--- a/lib/CodeGen/PostRASchedulerList.cpp
+++ b/lib/CodeGen/PostRASchedulerList.cpp
@@ -282,9 +282,7 @@
   } else {
     // Check that post-RA scheduling is enabled for this target.
     // This may upgrade the AntiDepMode.
-    const TargetSubtargetInfo &ST =
-        Fn.getTarget().getSubtarget<TargetSubtargetInfo>();
-    if (!enablePostRAScheduler(ST, PassConfig->getOptLevel(),
+    if (!enablePostRAScheduler(Fn.getSubtarget(), PassConfig->getOptLevel(),
                                AntiDepMode, CriticalPathRCs))
       return false;
   }
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index 06530b9..6d29b98 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -495,7 +495,7 @@
 
       unsigned Align = MFI->getObjectAlignment(i);
       // Adjust to alignment boundary
-      Offset = (Offset+Align-1)/Align*Align;
+      Offset = RoundUpToAlignment(Offset, Align);
 
       MFI->setObjectOffset(i, -Offset);        // Set the computed offset
     }
@@ -504,7 +504,7 @@
     for (int i = MaxCSFI; i >= MinCSFI ; --i) {
       unsigned Align = MFI->getObjectAlignment(i);
       // Adjust to alignment boundary
-      Offset = (Offset+Align-1)/Align*Align;
+      Offset = RoundUpToAlignment(Offset, Align);
 
       MFI->setObjectOffset(i, Offset);
       Offset += MFI->getObjectSize(i);
@@ -537,7 +537,7 @@
     unsigned Align = MFI->getLocalFrameMaxAlign();
 
     // Adjust to alignment boundary.
-    Offset = (Offset + Align - 1) / Align * Align;
+    Offset = RoundUpToAlignment(Offset, Align);
 
     DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
 
@@ -656,8 +656,7 @@
     // If the frame pointer is eliminated, all frame offsets will be relative to
     // SP not FP. Align to MaxAlign so this works.
     StackAlign = std::max(StackAlign, MaxAlign);
-    unsigned AlignMask = StackAlign - 1;
-    Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
+    Offset = RoundUpToAlignment(Offset, StackAlign);
   }
 
   // Update frame info to pretend that this is part of the stack...
@@ -703,7 +702,8 @@
 /// register references and actual offsets.
 ///
 void PEI::replaceFrameIndices(MachineFunction &Fn) {
-  if (!Fn.getFrameInfo()->hasStackObjects()) return; // Nothing to do?
+  const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
+  if (!TFI.needsFrameIndexResolution(Fn)) return;
 
   // Store SPAdj at exit of a basic block.
   SmallVector<int, 8> SPState;
@@ -743,26 +743,19 @@
   const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
   const TargetRegisterInfo &TRI = *Fn.getSubtarget().getRegisterInfo();
   const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
-  bool StackGrowsDown =
-    TFI->getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
   int FrameSetupOpcode   = TII.getCallFrameSetupOpcode();
   int FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
 
   if (RS && !FrameIndexVirtualScavenging) RS->enterBasicBlock(BB);
 
+  bool InsideCallSequence = false;
+
   for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
 
     if (I->getOpcode() == FrameSetupOpcode ||
         I->getOpcode() == FrameDestroyOpcode) {
-      // Remember how much SP has been adjusted to create the call
-      // frame.
-      int Size = I->getOperand(0).getImm();
-
-      if ((!StackGrowsDown && I->getOpcode() == FrameSetupOpcode) ||
-          (StackGrowsDown && I->getOpcode() == FrameDestroyOpcode))
-        Size = -Size;
-
-      SPAdj += Size;
+      InsideCallSequence = (I->getOpcode() == FrameSetupOpcode);
+      SPAdj += TII.getSPAdjust(I);
 
       MachineBasicBlock::iterator PrevI = BB->end();
       if (I != BB->begin()) PrevI = std::prev(I);
@@ -797,6 +790,37 @@
         continue;
       }
 
+      // TODO: This code should be commoned with the code for
+      // PATCHPOINT. There's no good reason for the difference in
+      // implementation other than historical accident.  The only
+      // remaining difference is the unconditional use of the stack
+      // pointer as the base register.
+      if (MI->getOpcode() == TargetOpcode::STATEPOINT) {
+        assert((!MI->isDebugValue() || i == 0) &&
+               "Frame indicies can only appear as the first operand of a "
+               "DBG_VALUE machine instruction");
+        unsigned Reg;
+        MachineOperand &Offset = MI->getOperand(i + 1);
+        const unsigned refOffset =
+          TFI->getFrameIndexReferenceFromSP(Fn, MI->getOperand(i).getIndex(),
+                                            Reg);
+
+        Offset.setImm(Offset.getImm() + refOffset);
+        MI->getOperand(i).ChangeToRegister(Reg, false /*isDef*/);
+        continue;
+      }
+
+      // Frame allocations are target independent. Simply swap the index with
+      // the offset.
+      if (MI->getOpcode() == TargetOpcode::FRAME_ALLOC) {
+        assert(TFI->hasFP(Fn) && "frame alloc requires FP");
+        MachineOperand &FI = MI->getOperand(i);
+        unsigned Reg;
+        int FrameOffset = TFI->getFrameIndexReference(Fn, FI.getIndex(), Reg);
+        FI.ChangeToImmediate(FrameOffset);
+        continue;
+      }
+
       // Some instructions (e.g. inline asm instructions) can have
       // multiple frame indices and/or cause eliminateFrameIndex
       // to insert more than one instruction. We need the register
@@ -823,6 +847,16 @@
       break;
     }
 
+    // If we are looking at a call sequence, we need to keep track of
+    // the SP adjustment made by each instruction in the sequence.
+    // This includes both the frame setup/destroy pseudos (handled above),
+    // as well as other instructions that have side effects w.r.t. the SP.
+    // Note that this must come after eliminateFrameIndex, because
+    // if I itself referred to a frame index, we shouldn't count its own
+    // adjustment.
+    if (MI && InsideCallSequence)
+      SPAdj += TII.getSPAdjust(MI);
+
     if (DoIncr && I != BB->end()) ++I;
 
     // Update register states.
diff --git a/lib/CodeGen/RegAllocBase.cpp b/lib/CodeGen/RegAllocBase.cpp
index 122afd1..6b346f4 100644
--- a/lib/CodeGen/RegAllocBase.cpp
+++ b/lib/CodeGen/RegAllocBase.cpp
@@ -90,6 +90,7 @@
     // Unused registers can appear when the spiller coalesces snippets.
     if (MRI->reg_nodbg_empty(VirtReg->reg)) {
       DEBUG(dbgs() << "Dropping unused " << *VirtReg << '\n');
+      aboutToRemoveInterval(*VirtReg);
       LIS->removeInterval(VirtReg->reg);
       continue;
     }
@@ -139,6 +140,7 @@
       assert(!VRM->hasPhys(SplitVirtReg->reg) && "Register already assigned");
       if (MRI->reg_nodbg_empty(SplitVirtReg->reg)) {
         DEBUG(dbgs() << "not queueing unused  " << *SplitVirtReg << '\n');
+        aboutToRemoveInterval(*SplitVirtReg);
         LIS->removeInterval(SplitVirtReg->reg);
         continue;
       }
diff --git a/lib/CodeGen/RegAllocBase.h b/lib/CodeGen/RegAllocBase.h
index bbd79cd..659b8f5 100644
--- a/lib/CodeGen/RegAllocBase.h
+++ b/lib/CodeGen/RegAllocBase.h
@@ -96,6 +96,9 @@
   // Use this group name for NamedRegionTimer.
   static const char TimerGroupName[];
 
+  /// Method called when the allocator is about to remove a LiveInterval.
+  virtual void aboutToRemoveInterval(LiveInterval &LI) {}
+
 public:
   /// VerifyEnabled - True when -verify-regalloc is given.
   static bool VerifyEnabled;
diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index 8fc10b4..c621414 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -372,15 +372,23 @@
     case regDisabled:
       break;
     case regReserved:
-      assert(TRI->isSuperRegister(PhysReg, Alias) &&
+      // Either PhysReg is a subregister of Alias and we mark the
+      // whole register as free, or PhysReg is the superregister of
+      // Alias and we mark all the aliases as disabled before freeing
+      // PhysReg.
+      // In the latter case, since PhysReg was disabled, this means that
+      // its value is defined only by physical sub-registers. This check
+      // is performed by the assert of the default case in this loop.
+      // Note: The value of the superregister may only be partially
+      // defined; that is why regDisabled is a valid state for aliases.
+      assert((TRI->isSuperRegister(PhysReg, Alias) ||
+              TRI->isSuperRegister(Alias, PhysReg)) &&
              "Instruction is not using a subregister of a reserved register");
-      // Leave the superregister in the working set.
-      PhysRegState[Alias] = regFree;
-      MO.getParent()->addRegisterKilled(Alias, TRI, true);
-      return;
+      // Fall through.
     case regFree:
       if (TRI->isSuperRegister(PhysReg, Alias)) {
         // Leave the superregister in the working set.
+        PhysRegState[Alias] = regFree;
         MO.getParent()->addRegisterKilled(Alias, TRI, true);
         return;
       }
@@ -1023,8 +1031,7 @@
 
       if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
         if (!MRI->isAllocatable(Reg)) continue;
-        definePhysReg(MI, Reg, (MO.isImplicit() || MO.isDead()) ?
-                               regFree : regReserved);
+        definePhysReg(MI, Reg, MO.isDead() ? regFree : regReserved);
         continue;
       }
       LiveRegMap::iterator LRI = defineVirtReg(MI, i, Reg, CopySrc);
diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index 8ef5dcd..edc3294 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@@ -296,6 +296,9 @@
   /// obtained from the TargetSubtargetInfo.
   bool EnableLocalReassign;
 
+  /// Set of broken hints that may be reconciled later because of eviction.
+  SmallSetVector<LiveInterval *, 8> SetOfBrokenHints;
+
 public:
   RAGreedy();
 
@@ -311,6 +314,7 @@
   void enqueue(LiveInterval *LI) override;
   LiveInterval *dequeue() override;
   unsigned selectOrSplit(LiveInterval&, SmallVectorImpl<unsigned>&) override;
+  void aboutToRemoveInterval(LiveInterval &) override;
 
   /// Perform register allocation.
   bool runOnMachineFunction(MachineFunction &mf) override;
@@ -378,6 +382,24 @@
                                    SmallVirtRegSet &, unsigned);
   bool tryRecoloringCandidates(PQueue &, SmallVectorImpl<unsigned> &,
                                SmallVirtRegSet &, unsigned);
+  void tryHintRecoloring(LiveInterval &);
+  void tryHintsRecoloring();
+
+  /// Model the information carried by one end of a copy.
+  struct HintInfo {
+    /// The frequency of the copy.
+    BlockFrequency Freq;
+    /// The virtual register or physical register.
+    unsigned Reg;
+    /// Its currently assigned register.
+    /// In case of a physical register Reg == PhysReg.
+    unsigned PhysReg;
+    HintInfo(BlockFrequency Freq, unsigned Reg, unsigned PhysReg)
+        : Freq(Freq), Reg(Reg), PhysReg(PhysReg) {}
+  };
+  typedef SmallVector<HintInfo, 4> HintsInfo;
+  BlockFrequency getBrokenHintFreq(const HintsInfo &, unsigned);
+  void collectHintInfo(unsigned, HintsInfo &);
 };
 } // end anonymous namespace
 
@@ -453,7 +475,9 @@
 
 bool RAGreedy::LRE_CanEraseVirtReg(unsigned VirtReg) {
   if (VRM->hasPhys(VirtReg)) {
-    Matrix->unassign(LIS->getInterval(VirtReg));
+    LiveInterval &LI = LIS->getInterval(VirtReg);
+    Matrix->unassign(LI);
+    aboutToRemoveInterval(LI);
     return true;
   }
   // Unassigned virtreg is probably in the priority queue.
@@ -2213,6 +2237,11 @@
   return PhysReg;
 }
 
+void RAGreedy::aboutToRemoveInterval(LiveInterval &LI) {
+  // Do not keep invalid information around.
+  SetOfBrokenHints.remove(&LI);
+}
+
 void RAGreedy::initializeCSRCost() {
   // We use the larger one out of the command-line option and the value report
   // by TRI.
@@ -2238,6 +2267,170 @@
     CSRCost = CSRCost.getFrequency() * (ActualEntry / FixedEntry);
 }
 
+/// \brief Collect the hint info for \p Reg.
+/// The results are stored into \p Out.
+/// \p Out is not cleared before being populated.
+void RAGreedy::collectHintInfo(unsigned Reg, HintsInfo &Out) {
+  for (const MachineInstr &Instr : MRI->reg_nodbg_instructions(Reg)) {
+    if (!Instr.isFullCopy())
+      continue;
+    // Look for the other end of the copy.
+    unsigned OtherReg = Instr.getOperand(0).getReg();
+    if (OtherReg == Reg) {
+      OtherReg = Instr.getOperand(1).getReg();
+      if (OtherReg == Reg)
+        continue;
+    }
+    // Get the current assignment.
+    unsigned OtherPhysReg = TargetRegisterInfo::isPhysicalRegister(OtherReg)
+                                ? OtherReg
+                                : VRM->getPhys(OtherReg);
+    // Push the collected information.
+    Out.push_back(HintInfo(MBFI->getBlockFreq(Instr.getParent()), OtherReg,
+                           OtherPhysReg));
+  }
+}
+
+/// \brief Using the given \p List, compute the cost of the broken hints if
+/// \p PhysReg was used.
+/// \return The cost of \p List for \p PhysReg.
+BlockFrequency RAGreedy::getBrokenHintFreq(const HintsInfo &List,
+                                           unsigned PhysReg) {
+  BlockFrequency Cost = 0;
+  for (const HintInfo &Info : List) {
+    if (Info.PhysReg != PhysReg)
+      Cost += Info.Freq;
+  }
+  return Cost;
+}
+
+/// \brief Using the register assigned to \p VirtReg, try to recolor
+/// all the live ranges that are copy-related with \p VirtReg.
+/// The recoloring is then propagated to all the live-ranges that have
+/// been recolored and so on, until no more copies can be coalesced or
+/// it is not profitable.
+/// For a given live range, profitability is determined by the sum of the
+/// frequencies of the non-identity copies it would introduce with the old
+/// and new register.
+void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) {
+  // We have a broken hint; check if it is possible to fix it by
+  // reusing PhysReg for the copy-related live-ranges. Indeed, we evicted
+  // some register and PhysReg may be available for the other live-ranges.
+  SmallSet<unsigned, 4> Visited;
+  SmallVector<unsigned, 2> RecoloringCandidates;
+  HintsInfo Info;
+  unsigned Reg = VirtReg.reg;
+  unsigned PhysReg = VRM->getPhys(Reg);
+  // Start the recoloring algorithm from the input live-interval, then
+  // it will propagate to the ones that are copy-related with it.
+  Visited.insert(Reg);
+  RecoloringCandidates.push_back(Reg);
+
+  DEBUG(dbgs() << "Trying to reconcile hints for: " << PrintReg(Reg, TRI) << '('
+               << PrintReg(PhysReg, TRI) << ")\n");
+
+  do {
+    Reg = RecoloringCandidates.pop_back_val();
+
+    // We cannot recolor physical registers.
+    if (TargetRegisterInfo::isPhysicalRegister(Reg))
+      continue;
+
+    assert(VRM->hasPhys(Reg) && "We have unallocated variable!!");
+
+    // Get the live interval mapped with this virtual register to be able
+    // to check for the interference with the new color.
+    LiveInterval &LI = LIS->getInterval(Reg);
+    unsigned CurrPhys = VRM->getPhys(Reg);
+    // Check that the new color matches the register class constraints and
+    // that it is free for this live range.
+    if (CurrPhys != PhysReg && (!MRI->getRegClass(Reg)->contains(PhysReg) ||
+                                Matrix->checkInterference(LI, PhysReg)))
+      continue;
+
+    DEBUG(dbgs() << PrintReg(Reg, TRI) << '(' << PrintReg(CurrPhys, TRI)
+                 << ") is recolorable.\n");
+
+    // Gather the hint info.
+    Info.clear();
+    collectHintInfo(Reg, Info);
+    // Check if recoloring the live-range will increase the cost of the
+    // non-identity copies.
+    if (CurrPhys != PhysReg) {
+      DEBUG(dbgs() << "Checking profitability:\n");
+      BlockFrequency OldCopiesCost = getBrokenHintFreq(Info, CurrPhys);
+      BlockFrequency NewCopiesCost = getBrokenHintFreq(Info, PhysReg);
+      DEBUG(dbgs() << "Old Cost: " << OldCopiesCost.getFrequency()
+                   << "\nNew Cost: " << NewCopiesCost.getFrequency() << '\n');
+      if (OldCopiesCost < NewCopiesCost) {
+        DEBUG(dbgs() << "=> Not profitable.\n");
+        continue;
+      }
+      // At this point, the cost is either cheaper or equal. If it is
+      // equal, we consider this profitable because it may expose
+      // more recoloring opportunities.
+      DEBUG(dbgs() << "=> Profitable.\n");
+      // Recolor the live-range.
+      Matrix->unassign(LI);
+      Matrix->assign(LI, PhysReg);
+    }
+    // Push all copy-related live-ranges to keep reconciling the broken
+    // hints.
+    for (const HintInfo &HI : Info) {
+      if (Visited.insert(HI.Reg).second)
+        RecoloringCandidates.push_back(HI.Reg);
+    }
+  } while (!RecoloringCandidates.empty());
+}
+
+/// \brief Try to recolor broken hints.
+/// Broken hints may be repaired by recoloring when an evicted variable
+/// freed up a register for a larger live-range.
+/// Consider the following example:
+/// BB1:
+///   a =
+///   b =
+/// BB2:
+///   ...
+///   = b
+///   = a
+/// Let us assume b gets split:
+/// BB1:
+///   a =
+///   b =
+/// BB2:
+///   c = b
+///   ...
+///   d = c
+///   = d
+///   = a
+/// Because of how the allocation works, b, c, and d may be assigned different
+/// colors. Now, if a gets evicted later:
+/// BB1:
+///   a =
+///   st a, SpillSlot
+///   b =
+/// BB2:
+///   c = b
+///   ...
+///   d = c
+///   = d
+///   e = ld SpillSlot
+///   = e
+/// It is likely that we can assign the same register to b, c, and d,
+/// getting rid of 2 copies.
+void RAGreedy::tryHintsRecoloring() {
+  for (LiveInterval *LI : SetOfBrokenHints) {
+    assert(TargetRegisterInfo::isVirtualRegister(LI->reg) &&
+           "Recoloring is possible only for virtual registers");
+    // Some dead defs may be around (e.g., because of debug uses).
+    // Ignore those.
+    if (!VRM->hasPhys(LI->reg))
+      continue;
+    tryHintRecoloring(*LI);
+  }
+}
+
 unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,
                                      SmallVectorImpl<unsigned> &NewVRegs,
                                      SmallVirtRegSet &FixedRegisters,
@@ -2274,8 +2467,18 @@
   // queue. The RS_Split ranges already failed to do this, and they should not
   // get a second chance until they have been split.
   if (Stage != RS_Split)
-    if (unsigned PhysReg = tryEvict(VirtReg, Order, NewVRegs, CostPerUseLimit))
+    if (unsigned PhysReg =
+            tryEvict(VirtReg, Order, NewVRegs, CostPerUseLimit)) {
+      unsigned Hint = MRI->getSimpleHint(VirtReg.reg);
+      // If VirtReg has a hint and that hint is broken, record this
+      // virtual register as a recoloring candidate for broken hints.
+      // Indeed, since we evicted a variable in its neighborhood it is
+      // likely we can at least partially recolor some of the
+      // copy-related live-ranges.
+      if (Hint && Hint != PhysReg)
+        SetOfBrokenHints.insert(&VirtReg);
       return PhysReg;
+    }
 
   assert(NewVRegs.empty() && "Cannot append to existing NewVRegs");
 
@@ -2355,8 +2558,10 @@
   NextCascade = 1;
   IntfCache.init(MF, Matrix->getLiveUnions(), Indexes, LIS, TRI);
   GlobalCand.resize(32);  // This will grow as needed.
+  SetOfBrokenHints.clear();
 
   allocatePhysRegs();
+  tryHintsRecoloring();
   releaseMemory();
   return true;
 }
diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp
index eb7e5633..77a42b3 100644
--- a/lib/CodeGen/RegAllocPBQP.cpp
+++ b/lib/CodeGen/RegAllocPBQP.cpp
@@ -126,7 +126,12 @@
   void findVRegIntervalsToAlloc(const MachineFunction &MF, LiveIntervals &LIS);
 
   /// \brief Constructs an initial graph.
-  void initializeGraph(PBQPRAGraph &G);
+  void initializeGraph(PBQPRAGraph &G, VirtRegMap &VRM, Spiller &VRegSpiller);
+
+  /// \brief Spill the given VReg.
+  void spillVReg(unsigned VReg, SmallVectorImpl<unsigned> &NewIntervals,
+                 MachineFunction &MF, LiveIntervals &LIS, VirtRegMap &VRM,
+                 Spiller &VRegSpiller);
 
   /// \brief Given a solved PBQP problem maps this solution back to a register
   /// assignment.
@@ -172,8 +177,6 @@
 class Interference : public PBQPRAConstraint {
 private:
 
-private:
-
   typedef const PBQP::RegAlloc::AllowedRegVector* AllowedRegVecPtr;
   typedef std::pair<AllowedRegVecPtr, AllowedRegVecPtr> IMatrixKey;
   typedef DenseMap<IMatrixKey, PBQPRAGraph::MatrixPtr> IMatrixCache;
@@ -308,7 +311,7 @@
                               PBQPRAGraph::NodeId MId, IMatrixCache &C) {
 
     const TargetRegisterInfo &TRI =
-      *G.getMetadata().MF.getTarget().getSubtargetImpl()->getRegisterInfo();
+        *G.getMetadata().MF.getSubtarget().getRegisterInfo();
 
     const auto &NRegs = G.getNodeMetadata(NId).getAllowedRegs();
     const auto &MRegs = G.getNodeMetadata(MId).getAllowedRegs();
@@ -342,7 +345,7 @@
   void apply(PBQPRAGraph &G) override {
     MachineFunction &MF = G.getMetadata().MF;
     MachineBlockFrequencyInfo &MBFI = G.getMetadata().MBFI;
-    CoalescerPair CP(*MF.getTarget().getSubtargetImpl()->getRegisterInfo());
+    CoalescerPair CP(*MF.getSubtarget().getRegisterInfo());
 
     // Scan the machine function and add a coalescing cost whenever CoalescerPair
     // gives the Ok.
@@ -398,7 +401,7 @@
             }
             PBQPRAGraph::RawMatrix Costs(G.getEdgeCosts(EId));
             addVirtRegCoalesce(Costs, *Allowed1, *Allowed2, CBenefit);
-            G.setEdgeCosts(EId, std::move(Costs));
+            G.updateEdgeCosts(EId, std::move(Costs));
           }
         }
       }
@@ -488,15 +491,21 @@
   return false;
 }
 
-void RegAllocPBQP::initializeGraph(PBQPRAGraph &G) {
+void RegAllocPBQP::initializeGraph(PBQPRAGraph &G, VirtRegMap &VRM,
+                                   Spiller &VRegSpiller) {
   MachineFunction &MF = G.getMetadata().MF;
 
   LiveIntervals &LIS = G.getMetadata().LIS;
   const MachineRegisterInfo &MRI = G.getMetadata().MF.getRegInfo();
   const TargetRegisterInfo &TRI =
-    *G.getMetadata().MF.getTarget().getSubtargetImpl()->getRegisterInfo();
+      *G.getMetadata().MF.getSubtarget().getRegisterInfo();
 
-  for (auto VReg : VRegsToAlloc) {
+  std::vector<unsigned> Worklist(VRegsToAlloc.begin(), VRegsToAlloc.end());
+
+  while (!Worklist.empty()) {
+    unsigned VReg = Worklist.back();
+    Worklist.pop_back();
+
     const TargetRegisterClass *TRC = MRI.getRegClass(VReg);
     LiveInterval &VRegLI = LIS.getInterval(VReg);
 
@@ -531,6 +540,15 @@
       VRegAllowed.push_back(PReg);
     }
 
+    // Check for vregs that have no allowed registers. These should be
+    // pre-spilled and the new vregs added to the worklist.
+    if (VRegAllowed.empty()) {
+      SmallVector<unsigned, 8> NewVRegs;
+      spillVReg(VReg, NewVRegs, MF, LIS, VRM, VRegSpiller);
+      Worklist.insert(Worklist.end(), NewVRegs.begin(), NewVRegs.end());
+      continue;
+    }
+
     PBQPRAGraph::RawVector NodeCosts(VRegAllowed.size() + 1, 0);
 
     // Tweak cost of callee saved registers, as using then force spilling and
@@ -547,14 +565,40 @@
   }
 }
 
+void RegAllocPBQP::spillVReg(unsigned VReg,
+                             SmallVectorImpl<unsigned> &NewIntervals,
+                             MachineFunction &MF, LiveIntervals &LIS,
+                             VirtRegMap &VRM, Spiller &VRegSpiller) {
+
+  VRegsToAlloc.erase(VReg);
+  LiveRangeEdit LRE(&LIS.getInterval(VReg), NewIntervals, MF, LIS, &VRM);
+  VRegSpiller.spill(LRE);
+
+  const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+  (void)TRI;
+  DEBUG(dbgs() << "VREG " << PrintReg(VReg, &TRI) << " -> SPILLED (Cost: "
+               << LRE.getParent().weight << ", New vregs: ");
+
+  // Copy any newly inserted live intervals into the list of regs to
+  // allocate.
+  for (LiveRangeEdit::iterator I = LRE.begin(), E = LRE.end();
+       I != E; ++I) {
+    const LiveInterval &LI = LIS.getInterval(*I);
+    assert(!LI.empty() && "Empty spill range.");
+    DEBUG(dbgs() << PrintReg(LI.reg, &TRI) << " ");
+    VRegsToAlloc.insert(LI.reg);
+  }
+
+  DEBUG(dbgs() << ")\n");
+}
+
 bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAGraph &G,
                                      const PBQP::Solution &Solution,
                                      VirtRegMap &VRM,
                                      Spiller &VRegSpiller) {
   MachineFunction &MF = G.getMetadata().MF;
   LiveIntervals &LIS = G.getMetadata().LIS;
-  const TargetRegisterInfo &TRI =
-    *MF.getTarget().getSubtargetImpl()->getRegisterInfo();
+  const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
   (void)TRI;
 
   // Set to true if we have any spills
@@ -576,28 +620,11 @@
       assert(PReg != 0 && "Invalid preg selected.");
       VRM.assignVirt2Phys(VReg, PReg);
     } else {
-      VRegsToAlloc.erase(VReg);
-      SmallVector<unsigned, 8> NewSpills;
-      LiveRangeEdit LRE(&LIS.getInterval(VReg), NewSpills, MF, LIS, &VRM);
-      VRegSpiller.spill(LRE);
-
-      DEBUG(dbgs() << "VREG " << PrintReg(VReg, &TRI) << " -> SPILLED (Cost: "
-                   << LRE.getParent().weight << ", New vregs: ");
-
-      // Copy any newly inserted live intervals into the list of regs to
-      // allocate.
-      for (LiveRangeEdit::iterator I = LRE.begin(), E = LRE.end();
-           I != E; ++I) {
-        LiveInterval &LI = LIS.getInterval(*I);
-        assert(!LI.empty() && "Empty spill range.");
-        DEBUG(dbgs() << PrintReg(LI.reg, &TRI) << " ");
-        VRegsToAlloc.insert(LI.reg);
-      }
-
-      DEBUG(dbgs() << ")\n");
-
-      // We need another round if spill intervals were added.
-      AnotherRoundNeeded |= !LRE.empty();
+      // Spill VReg. If this introduces new intervals we'll need another round
+      // of allocation.
+      SmallVector<unsigned, 8> NewVRegs;
+      spillVReg(VReg, NewVRegs, MF, LIS, VRM, VRegSpiller);
+      AnotherRoundNeeded |= !NewVRegs.empty();
     }
   }
 
@@ -670,7 +697,7 @@
   // If there are non-empty intervals allocate them using pbqp.
   if (!VRegsToAlloc.empty()) {
 
-    const TargetSubtargetInfo &Subtarget = *MF.getTarget().getSubtargetImpl();
+    const TargetSubtargetInfo &Subtarget = MF.getSubtarget();
     std::unique_ptr<PBQPRAConstraintList> ConstraintsRoot =
       llvm::make_unique<PBQPRAConstraintList>();
     ConstraintsRoot->addConstraint(llvm::make_unique<SpillCosts>());
@@ -686,7 +713,7 @@
       DEBUG(dbgs() << "  PBQP Regalloc round " << Round << ":\n");
 
       PBQPRAGraph G(PBQPRAGraph::GraphMetadata(MF, LIS, MBFI));
-      initializeGraph(G);
+      initializeGraph(G, VRM, *VRegSpiller);
       ConstraintsRoot->apply(G);
 
 #ifndef NDEBUG
@@ -699,7 +726,7 @@
         raw_fd_ostream OS(GraphFileName, EC, sys::fs::F_Text);
         DEBUG(dbgs() << "Dumping graph for round " << Round << " to \""
               << GraphFileName << "\"\n");
-        G.dumpToStream(OS);
+        G.dump(OS);
       }
 #endif
 
@@ -719,6 +746,79 @@
   return true;
 }
 
+namespace {
+// A helper class for printing node and register info in a consistent way
+class PrintNodeInfo {
+public:
+  typedef PBQP::RegAlloc::PBQPRAGraph Graph;
+  typedef PBQP::RegAlloc::PBQPRAGraph::NodeId NodeId;
+
+  PrintNodeInfo(NodeId NId, const Graph &G) : G(G), NId(NId) {}
+
+  void print(raw_ostream &OS) const {
+    const MachineRegisterInfo &MRI = G.getMetadata().MF.getRegInfo();
+    const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+    unsigned VReg = G.getNodeMetadata(NId).getVReg();
+    const char *RegClassName = TRI->getRegClassName(MRI.getRegClass(VReg));
+    OS << NId << " (" << RegClassName << ':' << PrintReg(VReg, TRI) << ')';
+  }
+
+private:
+  const Graph &G;
+  NodeId NId;
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS, const PrintNodeInfo &PR) {
+  PR.print(OS);
+  return OS;
+}
+} // anonymous namespace
+
+void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const {
+  for (auto NId : nodeIds()) {
+    const Vector &Costs = getNodeCosts(NId);
+    assert(Costs.getLength() != 0 && "Empty vector in graph.");
+    OS << PrintNodeInfo(NId, *this) << ": " << Costs << '\n';
+  }
+  OS << '\n';
+
+  for (auto EId : edgeIds()) {
+    NodeId N1Id = getEdgeNode1Id(EId);
+    NodeId N2Id = getEdgeNode2Id(EId);
+    assert(N1Id != N2Id && "PBQP graphs should not have self-edges.");
+    const Matrix &M = getEdgeCosts(EId);
+    assert(M.getRows() != 0 && "No rows in matrix.");
+    assert(M.getCols() != 0 && "No cols in matrix.");
+    OS << PrintNodeInfo(N1Id, *this) << ' ' << M.getRows() << " rows / ";
+    OS << PrintNodeInfo(N2Id, *this) << ' ' << M.getCols() << " cols:\n";
+    OS << M << '\n';
+  }
+}
+
+void PBQP::RegAlloc::PBQPRAGraph::dump() const { dump(dbgs()); }
+
+void PBQP::RegAlloc::PBQPRAGraph::printDot(raw_ostream &OS) const {
+  OS << "graph {\n";
+  for (auto NId : nodeIds()) {
+    OS << "  node" << NId << " [ label=\""
+       << PrintNodeInfo(NId, *this) << "\\n"
+       << getNodeCosts(NId) << "\" ]\n";
+  }
+
+  OS << "  edge [ len=" << nodeIds().size() << " ]\n";
+  for (auto EId : edgeIds()) {
+    OS << "  node" << getEdgeNode1Id(EId)
+       << " -- node" << getEdgeNode2Id(EId)
+       << " [ label=\"";
+    const Matrix &EdgeCosts = getEdgeCosts(EId);
+    for (unsigned i = 0; i < EdgeCosts.getRows(); ++i) {
+      OS << EdgeCosts.getRowAsVector(i) << "\\n";
+    }
+    OS << "\" ]\n";
+  }
+  OS << "}\n";
+}
+
 FunctionPass *llvm::createPBQPRegisterAllocator(char *customPassID) {
   return new RegAllocPBQP(customPassID);
 }
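As an illustrative sketch (not part of the patch itself): the new dump() and printDot() methods above walk the node and edge id ranges and print the cost vectors and matrices, the latter in Graphviz "graph { ... }" syntax. A minimal standalone C++ sketch of the same output shape, using plain std::vector cost data in place of the PBQP containers (all names here are made up for the example):

#include <cstdio>
#include <utility>
#include <vector>

// Emit a tiny undirected graph in Graphviz syntax, labelling each node with
// its stand-in cost vector, mirroring the structure of printDot() above.
static void printDotSketch(const std::vector<std::vector<int>> &NodeCosts,
                           const std::vector<std::pair<unsigned, unsigned>> &Edges) {
  std::printf("graph {\n");
  for (unsigned NId = 0; NId != NodeCosts.size(); ++NId) {
    std::printf("  node%u [ label=\"node%u:", NId, NId);
    for (int C : NodeCosts[NId])
      std::printf(" %d", C);
    std::printf("\" ]\n");
  }
  std::printf("  edge [ len=%zu ]\n", NodeCosts.size());
  for (const auto &E : Edges)
    std::printf("  node%u -- node%u\n", E.first, E.second);
  std::printf("}\n");
}

int main() {
  printDotSketch({{0, 1, 4}, {0, 2, 2}}, {{0, 1}});
  return 0;
}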
diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp
index e0d1aa2..ab33672 100644
--- a/lib/CodeGen/RegisterClassInfo.cpp
+++ b/lib/CodeGen/RegisterClassInfo.cpp
@@ -47,6 +47,7 @@
   }
 
   // Does this MF have different CSRs?
+  assert(TRI && "no register info set");
   const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF);
   if (Update || CSR != CalleeSaved) {
     // Build a CSRNum map. Every CSR alias gets an entry pointing to the last
@@ -76,6 +77,7 @@
 /// registers filtered out. Volatile registers come first followed by CSR
 /// aliases ordered according to the CSR order specified by the target.
 void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
+  assert(RC && "no register class given");
   RCInfo &RCI = RegClass[RC->getID()];
 
   // Raw register count, including all reserved regs.
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index 2d2dc92..1e4cfe8 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -32,6 +32,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
@@ -57,12 +58,12 @@
               cl::desc("Coalesce copies (default=true)"),
               cl::init(true));
 
-// Temporary flag to test critical edge unsplitting.
+/// Temporary flag to test critical edge unsplitting.
 static cl::opt<bool>
 EnableJoinSplits("join-splitedges",
   cl::desc("Coalesce copies on split edges (default=subtarget)"), cl::Hidden);
 
-// Temporary flag to test global copy optimization.
+/// Temporary flag to test global copy optimization.
 static cl::opt<cl::boolOrDefault>
 EnableGlobalCopies("join-globalcopies",
   cl::desc("Coalesce copies that span blocks (default=subtarget)"),
@@ -86,6 +87,14 @@
     AliasAnalysis *AA;
     RegisterClassInfo RegClassInfo;
 
+    /// A LaneMask to remember on which subregister live ranges we need to call
+    /// shrinkToUses() later.
+    unsigned ShrinkMask;
+
+    /// True if the main range of the currently coalesced intervals should be
+    /// checked for smaller live intervals.
+    bool ShrinkMainRange;
+
     /// \brief True if the coalescer should aggressively coalesce global copies
     /// in favor of keeping local copies.
     bool JoinGlobalCopies;
@@ -111,7 +120,7 @@
     /// Recursively eliminate dead defs in DeadDefs.
     void eliminateDeadDefs();
 
-    /// LiveRangeEdit callback.
+    /// LiveRangeEdit callback for eliminateDeadDefs().
     void LRE_WillEraseInstruction(MachineInstr *MI) override;
 
     /// Coalesce the LocalWorkList.
@@ -124,16 +133,15 @@
     /// copies that cannot yet be coalesced into WorkList.
     void copyCoalesceInMBB(MachineBasicBlock *MBB);
 
-    /// Try to coalesce all copies in CurrList. Return
-    /// true if any progress was made.
+    /// Tries to coalesce all copies in CurrList. Returns true if any progress
+    /// was made.
     bool copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList);
 
-    /// Attempt to join intervals corresponding to SrcReg/DstReg,
-    /// which are the src/dst of the copy instruction CopyMI.  This returns
-    /// true if the copy was successfully coalesced away. If it is not
-    /// currently possible to coalesce this interval, but it may be possible if
-    /// other things get coalesced, then it returns true by reference in
-    /// 'Again'.
+    /// Attempt to join intervals corresponding to SrcReg/DstReg, which are the
+    /// src/dst of the copy instruction CopyMI.  This returns true if the copy
+    /// was successfully coalesced away. If it is not currently possible to
+    /// coalesce this interval, but it may be possible if other things get
+    /// coalesced, then it returns true by reference in 'Again'.
     bool joinCopy(MachineInstr *TheCopy, bool &Again);
 
     /// Attempt to join these two intervals.  On failure, this
@@ -147,10 +155,23 @@
     /// Attempt joining with a reserved physreg.
     bool joinReservedPhysReg(CoalescerPair &CP);
 
-    /// We found a non-trivially-coalescable copy. If
-    /// the source value number is defined by a copy from the destination reg
-    /// see if we can merge these two destination reg valno# into a single
-    /// value number, eliminating a copy.
+    /// Add the LiveRange @p ToMerge as a subregister live range of @p LI.
+    /// Subranges in @p LI which only partially interfere with the desired
+    /// LaneMask are split as necessary. @p LaneMask is the set of lanes that
+    /// @p ToMerge will occupy in the coalesced register. @p LI has its subrange
+    /// lane masks already adjusted to the coalesced register.
+    void mergeSubRangeInto(LiveInterval &LI, const LiveRange &ToMerge,
+                           unsigned LaneMask, CoalescerPair &CP);
+
+    /// Join the live ranges of two subregisters. Joins @p RRange into
+    /// @p LRange; @p RRange may be invalid afterwards.
+    void joinSubRegRanges(LiveRange &LRange, LiveRange &RRange,
+                          unsigned LaneMask, const CoalescerPair &CP);
+
+    /// We found a non-trivially-coalescable copy. If the source value number is
+    /// defined by a copy from the destination reg see if we can merge these two
+    /// destination reg valno# into a single value number, eliminating a copy.
+    /// This returns true if an interval was modified.
     bool adjustCopiesBackFrom(const CoalescerPair &CP, MachineInstr *CopyMI);
 
     /// Return true if there are definitions of IntB
@@ -162,6 +183,7 @@
     /// If the source value number is defined by a commutable instruction and
     /// its other operand is coalesced to the copy dest register, see if we
     /// can transform the copy into a noop by commuting the definition.
+    /// This returns true if an interval was modified.
     bool removeCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI);
 
     /// If the source of a copy is defined by a
@@ -169,21 +191,21 @@
     bool reMaterializeTrivialDef(CoalescerPair &CP, MachineInstr *CopyMI,
                                  bool &IsDefCopy);
 
-    /// Return true if a physreg copy should be joined.
+    /// Return true if a copy involving a physreg should be joined.
     bool canJoinPhys(const CoalescerPair &CP);
 
-    /// Replace all defs and uses of SrcReg to DstReg and
-    /// update the subregister number if it is not zero. If DstReg is a
-    /// physical register and the existing subregister number of the def / use
-    /// being updated is not zero, make sure to set it to the correct physical
-    /// subregister.
+    /// Replace all defs and uses of SrcReg to DstReg and update the subregister
+    /// number if it is not zero. If DstReg is a physical register and the
+    /// existing subregister number of the def / use being updated is not zero,
+    /// make sure to set it to the correct physical subregister.
     void updateRegDefsUses(unsigned SrcReg, unsigned DstReg, unsigned SubIdx);
 
     /// Handle copies of undef values.
-    bool eliminateUndefCopy(MachineInstr *CopyMI, const CoalescerPair &CP);
+    /// Returns true if @p CopyMI was a copy of an undef value and eliminated.
+    bool eliminateUndefCopy(MachineInstr *CopyMI);
 
   public:
-    static char ID; // Class identification, replacement for typeinfo
+    static char ID; ///< Class identification, replacement for typeinfo
     RegisterCoalescer() : MachineFunctionPass(ID) {
       initializeRegisterCoalescerPass(*PassRegistry::getPassRegistry());
     }
@@ -198,7 +220,7 @@
     /// Implement the dump method.
     void print(raw_ostream &O, const Module* = nullptr) const override;
   };
-} /// end anonymous namespace
+} // end anonymous namespace
 
 char &llvm::RegisterCoalescerID = RegisterCoalescer::ID;
 
@@ -232,11 +254,11 @@
   return true;
 }
 
-// Return true if this block should be vacated by the coalescer to eliminate
-// branches. The important cases to handle in the coalescer are critical edges
-// split during phi elimination which contain only copies. Simple blocks that
-// contain non-branches should also be vacated, but this can be handled by an
-// earlier pass similar to early if-conversion.
+/// Return true if this block should be vacated by the coalescer to eliminate
+/// branches. The important cases to handle in the coalescer are critical edges
+/// split during phi elimination which contain only copies. Simple blocks that
+/// contain non-branches should also be vacated, but this can be handled by an
+/// earlier pass similar to early if-conversion.
 static bool isSplitEdge(const MachineBasicBlock *MBB) {
   if (MBB->pred_size() != 1 || MBB->succ_size() != 1)
     return false;
@@ -401,27 +423,11 @@
                 nullptr, this).eliminateDeadDefs(DeadDefs);
 }
 
-// Callback from eliminateDeadDefs().
 void RegisterCoalescer::LRE_WillEraseInstruction(MachineInstr *MI) {
   // MI may be in WorkList. Make sure we don't visit it.
   ErasedInstrs.insert(MI);
 }
 
-/// We found a non-trivially-coalescable copy with IntA
-/// being the source and IntB being the dest, thus this defines a value number
-/// in IntB.  If the source value number (in IntA) is defined by a copy from B,
-/// see if we can merge these two pieces of B into a single value number,
-/// eliminating a copy.  For example:
-///
-///  A3 = B0
-///    ...
-///  B1 = A3      <- this copy
-///
-/// In this case, B0 can be extended to where the B1 copy lives, allowing the B1
-/// value number to be replaced with B0 (which simplifies the B liveinterval).
-///
-/// This returns true if an interval was modified.
-///
 bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
                                              MachineInstr *CopyMI) {
   assert(!CP.isPartial() && "This doesn't work for partial copies.");
@@ -433,6 +439,20 @@
     LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
   SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot();
 
+  // We have a non-trivially-coalescable copy with IntA being the source and
+  // IntB being the dest, thus this defines a value number in IntB.  If the
+  // source value number (in IntA) is defined by a copy from B, see if we can
+  // merge these two pieces of B into a single value number, eliminating a copy.
+  // For example:
+  //
+  //  A3 = B0
+  //    ...
+  //  B1 = A3      <- this copy
+  //
+  // In this case, B0 can be extended to where the B1 copy lives, allowing the
+  // B1 value number to be replaced with B0 (which simplifies the B
+  // liveinterval).
+
   // BValNo is a value number in B that is defined by a copy from A.  'B1' in
   // the example above.
   LiveInterval::iterator BS = IntB.FindSegmentContaining(CopyIdx);
@@ -492,6 +512,16 @@
   // Okay, merge "B1" into the same value number as "B0".
   if (BValNo != ValS->valno)
     IntB.MergeValueNumberInto(BValNo, ValS->valno);
+
+  // Do the same for the subregister segments.
+  for (LiveInterval::SubRange &S : IntB.subranges()) {
+    VNInfo *SubBValNo = S.getVNInfoAt(CopyIdx);
+    S.addSegment(LiveInterval::Segment(FillerStart, FillerEnd, SubBValNo));
+    VNInfo *SubValSNo = S.getVNInfoAt(AValNo->def.getPrevSlot());
+    if (SubBValNo != SubValSNo)
+      S.MergeValueNumberInto(SubBValNo, SubValSNo);
+  }
+
   DEBUG(dbgs() << "   result = " << IntB << '\n');
 
   // If the source instruction was killing the source register before the
@@ -512,8 +542,6 @@
   return true;
 }
 
-/// Return true if there are definitions of IntB
-/// other than BValNo val# that can reach uses of AValno val# of IntA.
 bool RegisterCoalescer::hasOtherReachingDefs(LiveInterval &IntA,
                                              LiveInterval &IntB,
                                              VNInfo *AValNo,
@@ -523,69 +551,75 @@
   if (LIS->hasPHIKill(IntA, AValNo))
     return true;
 
-  for (LiveInterval::iterator AI = IntA.begin(), AE = IntA.end();
-       AI != AE; ++AI) {
-    if (AI->valno != AValNo) continue;
+  for (LiveRange::Segment &ASeg : IntA.segments) {
+    if (ASeg.valno != AValNo) continue;
     LiveInterval::iterator BI =
-      std::upper_bound(IntB.begin(), IntB.end(), AI->start);
+      std::upper_bound(IntB.begin(), IntB.end(), ASeg.start);
     if (BI != IntB.begin())
       --BI;
-    for (; BI != IntB.end() && AI->end >= BI->start; ++BI) {
+    for (; BI != IntB.end() && ASeg.end >= BI->start; ++BI) {
       if (BI->valno == BValNo)
         continue;
-      if (BI->start <= AI->start && BI->end > AI->start)
+      if (BI->start <= ASeg.start && BI->end > ASeg.start)
         return true;
-      if (BI->start > AI->start && BI->start < AI->end)
+      if (BI->start > ASeg.start && BI->start < ASeg.end)
         return true;
     }
   }
   return false;
 }
 
-/// We found a non-trivially-coalescable copy with
-/// IntA being the source and IntB being the dest, thus this defines a value
-/// number in IntB.  If the source value number (in IntA) is defined by a
-/// commutable instruction and its other operand is coalesced to the copy dest
-/// register, see if we can transform the copy into a noop by commuting the
-/// definition. For example,
-///
-///  A3 = op A2 B0<kill>
-///    ...
-///  B1 = A3      <- this copy
-///    ...
-///     = op A3   <- more uses
-///
-/// ==>
-///
-///  B2 = op B0 A2<kill>
-///    ...
-///  B1 = B2      <- now an identify copy
-///    ...
-///     = op B2   <- more uses
-///
-/// This returns true if an interval was modified.
-///
+/// Copy segments with value number @p SrcValNo from live range @p Src to live
+/// range @p Dst and use value number @p DstValNo there.
+static void addSegmentsWithValNo(LiveRange &Dst, VNInfo *DstValNo,
+                                 const LiveRange &Src, const VNInfo *SrcValNo)
+{
+  for (const LiveRange::Segment &S : Src.segments) {
+    if (S.valno != SrcValNo)
+      continue;
+    Dst.addSegment(LiveRange::Segment(S.start, S.end, DstValNo));
+  }
+}
+
 bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
                                                  MachineInstr *CopyMI) {
-  assert (!CP.isPhys());
-
-  SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot();
+  assert(!CP.isPhys());
 
   LiveInterval &IntA =
-    LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
+      LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
   LiveInterval &IntB =
-    LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
+      LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
+
+  // We found a non-trivially-coalescable copy with IntA being the source and
+  // IntB being the dest, thus this defines a value number in IntB.  If the
+  // source value number (in IntA) is defined by a commutable instruction and
+  // its other operand is coalesced to the copy dest register, see if we can
+  // transform the copy into a noop by commuting the definition. For example,
+  //
+  //  A3 = op A2 B0<kill>
+  //    ...
+  //  B1 = A3      <- this copy
+  //    ...
+  //     = op A3   <- more uses
+  //
+  // ==>
+  //
+  //  B2 = op B0 A2<kill>
+  //    ...
+  //  B1 = B2      <- now an identity copy
+  //    ...
+  //     = op B2   <- more uses
 
   // BValNo is a value number in B that is defined by a copy from A. 'B1' in
   // the example above.
+  SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI).getRegSlot();
   VNInfo *BValNo = IntB.getVNInfoAt(CopyIdx);
-  if (!BValNo || BValNo->def != CopyIdx)
-    return false;
+  assert(BValNo != nullptr && BValNo->def == CopyIdx);
 
   // AValNo is the value number in A that defines the copy, A3 in the example.
   VNInfo *AValNo = IntA.getVNInfoAt(CopyIdx.getRegSlot(true));
-  assert(AValNo && "COPY source not live");
-  if (AValNo->isPHIDef() || AValNo->isUnused())
+  assert(AValNo && !AValNo->isUnused() && "COPY source not live");
+  if (AValNo->isPHIDef())
     return false;
   MachineInstr *DefMI = LIS->getInstructionFromIndex(AValNo->def);
   if (!DefMI)
@@ -652,8 +686,6 @@
     MBB->insert(Pos, NewMI);
     MBB->erase(DefMI);
   }
-  unsigned OpIdx = NewMI->findRegisterUseOperandIdx(IntA.reg, false);
-  NewMI->getOperand(OpIdx).setIsKill();
 
   // If ALR and BLR overlaps and end of BLR extends beyond end of ALR, e.g.
   // A = or A, B
@@ -666,10 +698,13 @@
 
   // Update uses of IntA of the specific Val# with IntB.
   for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(IntA.reg),
-         UE = MRI->use_end(); UI != UE;) {
+                                         UE = MRI->use_end();
+       UI != UE; /* ++UI is below because of possible MI removal */) {
     MachineOperand &UseMO = *UI;
-    MachineInstr *UseMI = UseMO.getParent();
     ++UI;
+    if (UseMO.isUndef())
+      continue;
+    MachineInstr *UseMI = UseMO.getParent();
     if (UseMI->isDebugValue()) {
       // FIXME These don't have an instruction index.  Not clear we have enough
       // info to decide whether to do this replacement or not.  For now do it.
@@ -678,7 +713,8 @@
     }
     SlotIndex UseIdx = LIS->getInstructionIndex(UseMI).getRegSlot(true);
     LiveInterval::iterator US = IntA.FindSegmentContaining(UseIdx);
-    if (US == IntA.end() || US->valno != AValNo)
+    assert(US != IntA.end() && "Use must be live");
+    if (US->valno != AValNo)
       continue;
     // Kill flags are no longer accurate. They are recomputed after RA.
     UseMO.setIsKill(false);
@@ -702,7 +738,16 @@
       continue;
     DEBUG(dbgs() << "\t\tnoop: " << DefIdx << '\t' << *UseMI);
     assert(DVNI->def == DefIdx);
-    BValNo = IntB.MergeValueNumberInto(BValNo, DVNI);
+    BValNo = IntB.MergeValueNumberInto(DVNI, BValNo);
+    for (LiveInterval::SubRange &S : IntB.subranges()) {
+      VNInfo *SubDVNI = S.getVNInfoAt(DefIdx);
+      if (!SubDVNI)
+        continue;
+      VNInfo *SubBValNo = S.getVNInfoAt(CopyIdx);
+      assert(SubBValNo->def == CopyIdx);
+      S.MergeValueNumberInto(SubDVNI, SubBValNo);
+    }
+
     ErasedInstrs.insert(UseMI);
     LIS->RemoveMachineInstrFromMaps(UseMI);
     UseMI->eraseFromParent();
@@ -710,23 +755,82 @@
 
   // Extend BValNo by merging in IntA live segments of AValNo. Val# definition
   // is updated.
-  VNInfo *ValNo = BValNo;
-  ValNo->def = AValNo->def;
-  for (LiveInterval::iterator AI = IntA.begin(), AE = IntA.end();
-       AI != AE; ++AI) {
-    if (AI->valno != AValNo) continue;
-    IntB.addSegment(LiveInterval::Segment(AI->start, AI->end, ValNo));
+  BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+  if (IntB.hasSubRanges()) {
+    if (!IntA.hasSubRanges()) {
+      unsigned Mask = MRI->getMaxLaneMaskForVReg(IntA.reg);
+      IntA.createSubRangeFrom(Allocator, Mask, IntA);
+    }
+    SlotIndex AIdx = CopyIdx.getRegSlot(true);
+    for (LiveInterval::SubRange &SA : IntA.subranges()) {
+      VNInfo *ASubValNo = SA.getVNInfoAt(AIdx);
+      assert(ASubValNo != nullptr);
+
+      unsigned AMask = SA.LaneMask;
+      for (LiveInterval::SubRange &SB : IntB.subranges()) {
+        unsigned BMask = SB.LaneMask;
+        unsigned Common = BMask & AMask;
+        if (Common == 0)
+          continue;
+
+        DEBUG(
+            dbgs() << format("\t\tCopy+Merge %04X into %04X\n", BMask, Common));
+        unsigned BRest = BMask & ~AMask;
+        LiveInterval::SubRange *CommonRange;
+        if (BRest != 0) {
+          SB.LaneMask = BRest;
+          DEBUG(dbgs() << format("\t\tReduce Lane to %04X\n", BRest));
+          // Duplicate SubRange for newly merged common stuff.
+          CommonRange = IntB.createSubRangeFrom(Allocator, Common, SB);
+        } else {
+          // We can reuse the existing SubRange.
+          SB.LaneMask = Common;
+          CommonRange = &SB;
+        }
+        LiveRange RangeCopy(SB, Allocator);
+
+        VNInfo *BSubValNo = CommonRange->getVNInfoAt(CopyIdx);
+        assert(BSubValNo->def == CopyIdx);
+        BSubValNo->def = ASubValNo->def;
+        addSegmentsWithValNo(*CommonRange, BSubValNo, SA, ASubValNo);
+        AMask &= ~BMask;
+      }
+      if (AMask != 0) {
+        DEBUG(dbgs() << format("\t\tNew Lane %04X\n", AMask));
+        LiveRange *NewRange = IntB.createSubRange(Allocator, AMask);
+        VNInfo *BSubValNo = NewRange->getNextValue(CopyIdx, Allocator);
+        addSegmentsWithValNo(*NewRange, BSubValNo, SA, ASubValNo);
+      }
+    }
   }
+
+  BValNo->def = AValNo->def;
+  addSegmentsWithValNo(IntB, BValNo, IntA, AValNo);
   DEBUG(dbgs() << "\t\textended: " << IntB << '\n');
 
-  IntA.removeValNo(AValNo);
+  LIS->removeVRegDefAt(IntA, AValNo->def);
+
   DEBUG(dbgs() << "\t\ttrimmed:  " << IntA << '\n');
   ++numCommutes;
   return true;
 }
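An illustrative sketch (not part of the patch itself): the subrange handling added to removeCopyByCommutingDef() above splits each of IntB's lane masks into the part shared with an IntA subrange ("Common") and the remainder ("BRest"), reusing the existing subrange for one part and creating a fresh one for the other; A lanes left uncovered get a new subrange of their own. The standalone sketch below shows only that bit arithmetic on plain unsigned lane masks (the concrete mask values are invented):

#include <cassert>
#include <cstdio>

int main() {
  unsigned AMask = 0x0003; // hypothetical lanes of the A-side subrange
  unsigned BMask = 0x0006; // hypothetical lanes of one B-side subrange

  unsigned Common = BMask & AMask;  // lanes that receive A's segments
  unsigned BRest  = BMask & ~AMask; // B lanes untouched by the merge
  assert((Common | BRest) == BMask && (Common & BRest) == 0);
  std::printf("Common=%04X BRest=%04X\n", Common, BRest);

  // A lanes not covered by any existing B subrange need a new subrange.
  unsigned ALeftover = AMask & ~BMask;
  std::printf("new B subrange lane mask=%04X\n", ALeftover);
  return 0;
}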
 
-/// If the source of a copy is defined by a trivial
-/// computation, replace the copy by rematerialize the definition.
+/// Returns true if @p MI defines the full vreg @p Reg, as opposed to just
+/// defining a subregister.
+static bool definesFullReg(const MachineInstr &MI, unsigned Reg) {
+  assert(!TargetRegisterInfo::isPhysicalRegister(Reg) &&
+         "This code cannot handle physreg aliasing");
+  for (const MachineOperand &Op : MI.operands()) {
+    if (!Op.isReg() || !Op.isDef() || Op.getReg() != Reg)
+      continue;
+    // Return true if we define the full register or don't care about the value
+    // inside other subregisters.
+    if (Op.getSubReg() == 0 || Op.isUndef())
+      return true;
+  }
+  return false;
+}
+
 bool RegisterCoalescer::reMaterializeTrivialDef(CoalescerPair &CP,
                                                 MachineInstr *CopyMI,
                                                 bool &IsDefCopy) {
@@ -755,6 +859,8 @@
     return false;
   if (!TII->isTriviallyReMaterializable(DefMI, AA))
     return false;
+  if (!definesFullReg(*DefMI, SrcReg))
+    return false;
   bool SawStore = false;
   if (!DefMI->isSafeToMove(TII, AA, SawStore))
     return false;
@@ -825,12 +931,13 @@
     const TargetRegisterClass *NewRC = CP.getNewRC();
     unsigned NewIdx = NewMI->getOperand(0).getSubReg();
 
-    if (NewIdx)
-      NewRC = TRI->getMatchingSuperRegClass(NewRC, DefRC, NewIdx);
-    else
-      NewRC = TRI->getCommonSubClass(NewRC, DefRC);
-
-    assert(NewRC && "subreg chosen for remat incompatible with instruction");
+    if (DefRC != nullptr) {
+      if (NewIdx)
+        NewRC = TRI->getMatchingSuperRegClass(NewRC, DefRC, NewIdx);
+      else
+        NewRC = TRI->getCommonSubClass(NewRC, DefRC);
+      assert(NewRC && "subreg chosen for remat incompatible with instruction");
+    }
     MRI->setRegClass(DstReg, NewRC);
 
     updateRegDefsUses(DstReg, DstReg, DstIdx);
@@ -898,56 +1005,103 @@
 
   // The source interval can become smaller because we removed a use.
   LIS->shrinkToUses(&SrcInt, &DeadDefs);
-  if (!DeadDefs.empty())
+  if (!DeadDefs.empty()) {
+    // If the virtual SrcReg is completely eliminated, update all DBG_VALUEs
+    // to describe DstReg instead.
+    for (MachineOperand &UseMO : MRI->use_operands(SrcReg)) {
+      MachineInstr *UseMI = UseMO.getParent();
+      if (UseMI->isDebugValue()) {
+        UseMO.setReg(DstReg);
+        DEBUG(dbgs() << "\t\tupdated: " << *UseMI);
+      }
+    }
     eliminateDeadDefs();
+  }
 
   return true;
 }
 
-/// ProcessImpicitDefs may leave some copies of <undef>
-/// values, it only removes local variables. When we have a copy like:
-///
-///   %vreg1 = COPY %vreg2<undef>
-///
-/// We delete the copy and remove the corresponding value number from %vreg1.
-/// Any uses of that value number are marked as <undef>.
-bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI,
-                                           const CoalescerPair &CP) {
+bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI) {
+  // ProcessImplicitDefs may leave some copies of <undef> values; it only removes
+  // local variables. When we have a copy like:
+  //
+  //   %vreg1 = COPY %vreg2<undef>
+  //
+  // We delete the copy and remove the corresponding value number from %vreg1.
+  // Any uses of that value number are marked as <undef>.
+
+  // Note that we do not query CoalescerPair here but redo isMoveInstr as the
+  // CoalescerPair may have a new register class with adjusted subreg indices
+  // at this point.
+  unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx;
+  isMoveInstr(*TRI, CopyMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx);
+
   SlotIndex Idx = LIS->getInstructionIndex(CopyMI);
-  LiveInterval *SrcInt = &LIS->getInterval(CP.getSrcReg());
-  if (SrcInt->liveAt(Idx))
-    return false;
-  LiveInterval *DstInt = &LIS->getInterval(CP.getDstReg());
-  if (DstInt->liveAt(Idx))
+  const LiveInterval &SrcLI = LIS->getInterval(SrcReg);
+  // CopyMI is undef iff SrcReg is not live before the instruction.
+  if (SrcSubIdx != 0 && SrcLI.hasSubRanges()) {
+    unsigned SrcMask = TRI->getSubRegIndexLaneMask(SrcSubIdx);
+    for (const LiveInterval::SubRange &SR : SrcLI.subranges()) {
+      if ((SR.LaneMask & SrcMask) == 0)
+        continue;
+      if (SR.liveAt(Idx))
+        return false;
+    }
+  } else if (SrcLI.liveAt(Idx))
     return false;
 
-  // No intervals are live-in to CopyMI - it is undef.
-  if (CP.isFlipped())
-    DstInt = SrcInt;
-  SrcInt = nullptr;
+  DEBUG(dbgs() << "\tEliminating copy of <undef> value\n");
 
-  VNInfo *DeadVNI = DstInt->getVNInfoAt(Idx.getRegSlot());
-  assert(DeadVNI && "No value defined in DstInt");
-  DstInt->removeValNo(DeadVNI);
+  // Remove any DstReg segments starting at the instruction.
+  LiveInterval &DstLI = LIS->getInterval(DstReg);
+  SlotIndex RegIndex = Idx.getRegSlot();
+  // Remove value or merge with previous one in case of a subregister def.
+  if (VNInfo *PrevVNI = DstLI.getVNInfoAt(Idx)) {
+    VNInfo *VNI = DstLI.getVNInfoAt(RegIndex);
+    DstLI.MergeValueNumberInto(VNI, PrevVNI);
 
-  // Find new undef uses.
-  for (MachineOperand &MO : MRI->reg_nodbg_operands(DstInt->reg)) {
-    if (MO.isDef() || MO.isUndef())
+    // The affected subregister segments can be removed.
+    unsigned DstMask = TRI->getSubRegIndexLaneMask(DstSubIdx);
+    for (LiveInterval::SubRange &SR : DstLI.subranges()) {
+      if ((SR.LaneMask & DstMask) == 0)
+        continue;
+
+      VNInfo *SVNI = SR.getVNInfoAt(RegIndex);
+      assert(SVNI != nullptr && SlotIndex::isSameInstr(SVNI->def, RegIndex));
+      SR.removeValNo(SVNI);
+    }
+    DstLI.removeEmptySubRanges();
+  } else
+    LIS->removeVRegDefAt(DstLI, RegIndex);
+
+  // Mark uses as undef.
+  for (MachineOperand &MO : MRI->reg_nodbg_operands(DstReg)) {
+    if (MO.isDef() /*|| MO.isUndef()*/)
       continue;
-    MachineInstr *MI = MO.getParent();
-    SlotIndex Idx = LIS->getInstructionIndex(MI);
-    if (DstInt->liveAt(Idx))
+    const MachineInstr &MI = *MO.getParent();
+    SlotIndex UseIdx = LIS->getInstructionIndex(&MI);
+    unsigned UseMask = TRI->getSubRegIndexLaneMask(MO.getSubReg());
+    bool isLive;
+    if (UseMask != ~0u && DstLI.hasSubRanges()) {
+      isLive = false;
+      for (const LiveInterval::SubRange &SR : DstLI.subranges()) {
+        if ((SR.LaneMask & UseMask) == 0)
+          continue;
+        if (SR.liveAt(UseIdx)) {
+          isLive = true;
+          break;
+        }
+      }
+    } else
+      isLive = DstLI.liveAt(UseIdx);
+    if (isLive)
       continue;
     MO.setIsUndef(true);
-    DEBUG(dbgs() << "\tnew undef: " << Idx << '\t' << *MI);
+    DEBUG(dbgs() << "\tnew undef: " << UseIdx << '\t' << MI);
   }
   return true;
 }
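An illustrative sketch (not part of the patch itself): eliminateUndefCopy() above, and the subregister-use handling added to updateRegDefsUses() further down, both rely on the same query: a use of some lane set is live only if a subrange whose LaneMask overlaps that set is live at the use index; otherwise the use can be marked <undef>. A standalone version of that check over toy subranges (the types here are stand-ins, not LLVM's):

#include <utility>
#include <vector>

struct SubRangeSketch {
  unsigned LaneMask;
  std::vector<std::pair<unsigned, unsigned>> Segments; // half-open [start, end)
  bool liveAt(unsigned Idx) const {
    for (const auto &S : Segments)
      if (S.first <= Idx && Idx < S.second)
        return true;
    return false;
  }
};

// Returns true if any subrange covering a lane in UseMask is live at Idx;
// otherwise the use reads an undefined value.
static bool anyLaneLiveAt(const std::vector<SubRangeSketch> &SubRanges,
                          unsigned UseMask, unsigned Idx) {
  for (const SubRangeSketch &SR : SubRanges) {
    if ((SR.LaneMask & UseMask) == 0)
      continue; // this subrange tracks unrelated lanes
    if (SR.liveAt(Idx))
      return true;
  }
  return false;
}

int main() {
  std::vector<SubRangeSketch> SubRanges = {{0x1, {{0, 4}}}, {0x2, {{0, 8}}}};
  return anyLaneLiveAt(SubRanges, 0x1, 6) ? 1 : 0; // lane 0x1 is dead at 6
}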
 
-/// Replace all defs and uses of SrcReg to DstReg and update the subregister
-/// number if it is not zero. If DstReg is a physical register and the existing
-/// subregister number of the def / use being updated is not zero, make sure to
-/// set it to the correct physical subregister.
 void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg,
                                           unsigned DstReg,
                                           unsigned SubIdx) {
@@ -987,6 +1141,40 @@
       if (SubIdx && MO.isDef())
         MO.setIsUndef(!Reads);
 
+      // A subreg use of a partially undef (super) register may be a complete
+      // undef use now and then has to be marked that way.
+      if (SubIdx != 0 && MO.isUse() && MRI->tracksSubRegLiveness()) {
+        if (!DstInt->hasSubRanges()) {
+          BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+          unsigned Mask = MRI->getMaxLaneMaskForVReg(DstInt->reg);
+          DstInt->createSubRangeFrom(Allocator, Mask, *DstInt);
+        }
+        unsigned Mask = TRI->getSubRegIndexLaneMask(SubIdx);
+        bool IsUndef = true;
+        SlotIndex MIIdx = UseMI->isDebugValue()
+          ? LIS->getSlotIndexes()->getIndexBefore(UseMI)
+          : LIS->getInstructionIndex(UseMI);
+        SlotIndex UseIdx = MIIdx.getRegSlot(true);
+        for (LiveInterval::SubRange &S : DstInt->subranges()) {
+          if ((S.LaneMask & Mask) == 0)
+            continue;
+          if (S.liveAt(UseIdx)) {
+            IsUndef = false;
+            break;
+          }
+        }
+        if (IsUndef) {
+          MO.setIsUndef(true);
+          // We found out some subregister use is actually reading an undefined
+          // value. In some cases the whole vreg has become undefined at this
+          // point so we have to potentially shrink the main range if the
+          // use was ending a live segment there.
+          LiveQueryResult Q = DstInt->Query(MIIdx);
+          if (Q.valueOut() == nullptr)
+            ShrinkMainRange = true;
+        }
+      }
+
       if (DstIsPhys)
         MO.substPhysReg(DstReg, *TRI);
       else
@@ -1002,29 +1190,23 @@
   }
 }
 
-/// Return true if a copy involving a physreg should be joined.
 bool RegisterCoalescer::canJoinPhys(const CoalescerPair &CP) {
-  /// Always join simple intervals that are defined by a single copy from a
-  /// reserved register. This doesn't increase register pressure, so it is
-  /// always beneficial.
+  // Always join simple intervals that are defined by a single copy from a
+  // reserved register. This doesn't increase register pressure, so it is
+  // always beneficial.
   if (!MRI->isReserved(CP.getDstReg())) {
     DEBUG(dbgs() << "\tCan only merge into reserved registers.\n");
     return false;
   }
 
   LiveInterval &JoinVInt = LIS->getInterval(CP.getSrcReg());
-  if (CP.isFlipped() && JoinVInt.containsOneValue())
+  if (JoinVInt.containsOneValue())
     return true;
 
-  DEBUG(dbgs() << "\tCannot join defs into reserved register.\n");
+  DEBUG(dbgs() << "\tCannot join complex intervals into reserved register.\n");
   return false;
 }
 
-/// Attempt to join intervals corresponding to SrcReg/DstReg,
-/// which are the src/dst of the copy instruction CopyMI.  This returns true
-/// if the copy was successfully coalesced away. If it is not currently
-/// possible to coalesce this interval, but it may be possible if other
-/// things get coalesced, then it returns true by reference in 'Again'.
 bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
 
   Again = false;
@@ -1063,8 +1245,7 @@
   }
 
   // Eliminate undefs.
-  if (!CP.isPhys() && eliminateUndefCopy(CopyMI, CP)) {
-    DEBUG(dbgs() << "\tEliminated copy of <undef> value.\n");
+  if (!CP.isPhys() && eliminateUndefCopy(CopyMI)) {
     LIS->RemoveMachineInstrFromMaps(CopyMI);
     CopyMI->eraseFromParent();
     return false;  // Not coalescable.
@@ -1076,12 +1257,22 @@
   if (CP.getSrcReg() == CP.getDstReg()) {
     LiveInterval &LI = LIS->getInterval(CP.getSrcReg());
     DEBUG(dbgs() << "\tCopy already coalesced: " << LI << '\n');
-    LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(CopyMI));
+    const SlotIndex CopyIdx = LIS->getInstructionIndex(CopyMI);
+    LiveQueryResult LRQ = LI.Query(CopyIdx);
     if (VNInfo *DefVNI = LRQ.valueDefined()) {
       VNInfo *ReadVNI = LRQ.valueIn();
       assert(ReadVNI && "No value before copy and no <undef> flag.");
       assert(ReadVNI != DefVNI && "Cannot read and define the same value.");
       LI.MergeValueNumberInto(DefVNI, ReadVNI);
+
+      // Process subregister liveranges.
+      for (LiveInterval::SubRange &S : LI.subranges()) {
+        LiveQueryResult SLRQ = S.Query(CopyIdx);
+        if (VNInfo *SDefVNI = SLRQ.valueDefined()) {
+          VNInfo *SReadVNI = SLRQ.valueIn();
+          S.MergeValueNumberInto(SDefVNI, SReadVNI);
+        }
+      }
       DEBUG(dbgs() << "\tMerged values:          " << LI << '\n');
     }
     LIS->RemoveMachineInstrFromMaps(CopyMI);
@@ -1124,6 +1315,9 @@
     });
   }
 
+  ShrinkMask = 0;
+  ShrinkMainRange = false;
+
   // Okay, attempt to join these two intervals.  On failure, this returns false.
   // Otherwise, if one of the intervals being joined is a physreg, this method
   // always canonicalizes DstInt to be it.  The output "SrcInt" will not have
@@ -1178,12 +1372,28 @@
     updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx());
   updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx());
 
+  // Shrink subregister ranges if necessary.
+  if (ShrinkMask != 0) {
+    LiveInterval &LI = LIS->getInterval(CP.getDstReg());
+    for (LiveInterval::SubRange &S : LI.subranges()) {
+      if ((S.LaneMask & ShrinkMask) == 0)
+        continue;
+      DEBUG(dbgs() << "Shrink LaneUses (Lane "
+                   << format("%04X", S.LaneMask) << ")\n");
+      LIS->shrinkToUses(S, LI.reg);
+    }
+  }
+  if (ShrinkMainRange) {
+    LiveInterval &LI = LIS->getInterval(CP.getDstReg());
+    LIS->shrinkToUses(&LI);
+  }
+
   // SrcReg is guaranteed to be the register whose live interval that is
   // being merged.
   LIS->removeInterval(CP.getSrcReg());
 
   // Update regalloc hint.
-  TRI->UpdateRegAllocHint(CP.getSrcReg(), CP.getDstReg(), *MF);
+  TRI->updateRegAllocHint(CP.getSrcReg(), CP.getDstReg(), *MF);
 
   DEBUG({
     dbgs() << "\tSuccess: " << PrintReg(CP.getSrcReg(), TRI, CP.getSrcIdx())
@@ -1200,24 +1410,23 @@
   return true;
 }
 
-/// Attempt joining with a reserved physreg.
 bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
+  unsigned DstReg = CP.getDstReg();
   assert(CP.isPhys() && "Must be a physreg copy");
-  assert(MRI->isReserved(CP.getDstReg()) && "Not a reserved register");
+  assert(MRI->isReserved(DstReg) && "Not a reserved register");
   LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
   DEBUG(dbgs() << "\t\tRHS = " << RHS << '\n');
 
-  assert(CP.isFlipped() && RHS.containsOneValue() &&
-         "Invalid join with reserved register");
+  assert(RHS.containsOneValue() && "Invalid join with reserved register");
 
   // Optimization for reserved registers like ESP. We can only merge with a
-  // reserved physreg if RHS has a single value that is a copy of CP.DstReg().
+  // reserved physreg if RHS has a single value that is a copy of DstReg.
   // The live range of the reserved register will look like a set of dead defs
   // - we don't properly track the live range of reserved registers.
 
   // Deny any overlapping intervals.  This depends on all the reserved
   // register live ranges to look like dead defs.
-  for (MCRegUnitIterator UI(CP.getDstReg(), TRI); UI.isValid(); ++UI)
+  for (MCRegUnitIterator UI(DstReg, TRI); UI.isValid(); ++UI)
     if (RHS.overlaps(LIS->getRegUnit(*UI))) {
       DEBUG(dbgs() << "\t\tInterference: " << PrintRegUnit(*UI, TRI) << '\n');
       return false;
@@ -1229,7 +1438,46 @@
   // defs are there.
 
   // Delete the identity copy.
-  MachineInstr *CopyMI = MRI->getVRegDef(RHS.reg);
+  MachineInstr *CopyMI;
+  if (CP.isFlipped()) {
+    CopyMI = MRI->getVRegDef(RHS.reg);
+  } else {
+    if (!MRI->hasOneNonDBGUse(RHS.reg)) {
+      DEBUG(dbgs() << "\t\tMultiple vreg uses!\n");
+      return false;
+    }
+
+    MachineInstr *DestMI = MRI->getVRegDef(RHS.reg);
+    CopyMI = &*MRI->use_instr_nodbg_begin(RHS.reg);
+    const SlotIndex CopyRegIdx = LIS->getInstructionIndex(CopyMI).getRegSlot();
+    const SlotIndex DestRegIdx = LIS->getInstructionIndex(DestMI).getRegSlot();
+
+    // We checked above that there are no interfering defs of the physical
+    // register. However, for this case, where we intend to move up the def of
+    // the physical register, we also need to check for interfering uses.
+    SlotIndexes *Indexes = LIS->getSlotIndexes();
+    for (SlotIndex SI = Indexes->getNextNonNullIndex(DestRegIdx);
+         SI != CopyRegIdx; SI = Indexes->getNextNonNullIndex(SI)) {
+      MachineInstr *MI = LIS->getInstructionFromIndex(SI);
+      if (MI->readsRegister(DstReg, TRI)) {
+        DEBUG(dbgs() << "\t\tInterference (read): " << *MI);
+        return false;
+      }
+    }
+
+    // We're going to remove the copy which defines a physical reserved
+    // register, so remove its valno, etc.
+    DEBUG(dbgs() << "\t\tRemoving phys reg def of " << DstReg << " at "
+          << CopyRegIdx << "\n");
+
+    LIS->removePhysRegDefAt(DstReg, CopyRegIdx);
+    // Create a new dead def at the new def location.
+    for (MCRegUnitIterator UI(DstReg, TRI); UI.isValid(); ++UI) {
+      LiveRange &LR = LIS->getRegUnit(*UI);
+      LR.createDeadDef(DestRegIdx, LIS->getVNInfoAllocator());
+    }
+  }
+
   LIS->RemoveMachineInstrFromMaps(CopyMI);
   CopyMI->eraseFromParent();
 
@@ -1306,15 +1554,29 @@
 namespace {
 /// Track information about values in a single virtual register about to be
 /// joined. Objects of this class are always created in pairs - one for each
-/// side of the CoalescerPair.
+/// side of the CoalescerPair (or one for each lane of a side of the coalescer
+/// pair).
 class JoinVals {
-  LiveInterval &LI;
+  /// Live range we work on.
+  LiveRange &LR;
+  /// (Main) register we work on.
+  const unsigned Reg;
 
-  // Location of this register in the final joined register.
-  // Either CP.DstIdx or CP.SrcIdx.
-  unsigned SubIdx;
+  /// Reg (and therefore the values in this live range) will end up as
+  /// subregister SubIdx in the coalesced register. Either CP.DstIdx or
+  /// CP.SrcIdx.
+  const unsigned SubIdx;
+  /// The LaneMask that this live range will occupy in the coalesced register.
+  /// May be smaller than the lane mask produced by SubIdx when merging subranges.
+  const unsigned LaneMask;
 
-  // Values that will be present in the final live range.
+  /// This is true when joining subregister ranges, false when joining main
+  /// ranges.
+  const bool SubRangeJoin;
+  /// Whether the current LiveInterval tracks subregister liveness.
+  const bool TrackSubRegLiveness;
+
+  /// Values that will be present in the final live range.
   SmallVectorImpl<VNInfo*> &NewVNInfo;
 
   const CoalescerPair &CP;
@@ -1322,75 +1584,75 @@
   SlotIndexes *Indexes;
   const TargetRegisterInfo *TRI;
 
-  // Value number assignments. Maps value numbers in LI to entries in NewVNInfo.
-  // This is suitable for passing to LiveInterval::join().
+  /// Value number assignments. Maps value numbers in LI to entries in
+  /// NewVNInfo. This is suitable for passing to LiveInterval::join().
   SmallVector<int, 8> Assignments;
 
-  // Conflict resolution for overlapping values.
+  /// Conflict resolution for overlapping values.
   enum ConflictResolution {
-    // No overlap, simply keep this value.
+    /// No overlap, simply keep this value.
     CR_Keep,
 
-    // Merge this value into OtherVNI and erase the defining instruction.
-    // Used for IMPLICIT_DEF, coalescable copies, and copies from external
-    // values.
+    /// Merge this value into OtherVNI and erase the defining instruction.
+    /// Used for IMPLICIT_DEF, coalescable copies, and copies from external
+    /// values.
     CR_Erase,
 
-    // Merge this value into OtherVNI but keep the defining instruction.
-    // This is for the special case where OtherVNI is defined by the same
-    // instruction.
+    /// Merge this value into OtherVNI but keep the defining instruction.
+    /// This is for the special case where OtherVNI is defined by the same
+    /// instruction.
     CR_Merge,
 
-    // Keep this value, and have it replace OtherVNI where possible. This
-    // complicates value mapping since OtherVNI maps to two different values
-    // before and after this def.
-    // Used when clobbering undefined or dead lanes.
+    /// Keep this value, and have it replace OtherVNI where possible. This
+    /// complicates value mapping since OtherVNI maps to two different values
+    /// before and after this def.
+    /// Used when clobbering undefined or dead lanes.
     CR_Replace,
 
-    // Unresolved conflict. Visit later when all values have been mapped.
+    /// Unresolved conflict. Visit later when all values have been mapped.
     CR_Unresolved,
 
-    // Unresolvable conflict. Abort the join.
+    /// Unresolvable conflict. Abort the join.
     CR_Impossible
   };
 
-  // Per-value info for LI. The lane bit masks are all relative to the final
-  // joined register, so they can be compared directly between SrcReg and
-  // DstReg.
+  /// Per-value info for LI. The lane bit masks are all relative to the final
+  /// joined register, so they can be compared directly between SrcReg and
+  /// DstReg.
   struct Val {
     ConflictResolution Resolution;
 
-    // Lanes written by this def, 0 for unanalyzed values.
+    /// Lanes written by this def, 0 for unanalyzed values.
     unsigned WriteLanes;
 
-    // Lanes with defined values in this register. Other lanes are undef and
-    // safe to clobber.
+    /// Lanes with defined values in this register. Other lanes are undef and
+    /// safe to clobber.
     unsigned ValidLanes;
 
-    // Value in LI being redefined by this def.
+    /// Value in LI being redefined by this def.
     VNInfo *RedefVNI;
 
-    // Value in the other live range that overlaps this def, if any.
+    /// Value in the other live range that overlaps this def, if any.
     VNInfo *OtherVNI;
 
-    // Is this value an IMPLICIT_DEF that can be erased?
-    //
-    // IMPLICIT_DEF values should only exist at the end of a basic block that
-    // is a predecessor to a phi-value. These IMPLICIT_DEF instructions can be
-    // safely erased if they are overlapping a live value in the other live
-    // interval.
-    //
-    // Weird control flow graphs and incomplete PHI handling in
-    // ProcessImplicitDefs can very rarely create IMPLICIT_DEF values with
-    // longer live ranges. Such IMPLICIT_DEF values should be treated like
-    // normal values.
+    /// Is this value an IMPLICIT_DEF that can be erased?
+    ///
+    /// IMPLICIT_DEF values should only exist at the end of a basic block that
+    /// is a predecessor to a phi-value. These IMPLICIT_DEF instructions can be
+    /// safely erased if they are overlapping a live value in the other live
+    /// interval.
+    ///
+    /// Weird control flow graphs and incomplete PHI handling in
+    /// ProcessImplicitDefs can very rarely create IMPLICIT_DEF values with
+    /// longer live ranges. Such IMPLICIT_DEF values should be treated like
+    /// normal values.
     bool ErasableImplicitDef;
 
-    // True when the live range of this value will be pruned because of an
-    // overlapping CR_Replace value in the other live range.
+    /// True when the live range of this value will be pruned because of an
+    /// overlapping CR_Replace value in the other live range.
     bool Pruned;
 
-    // True once Pruned above has been computed.
+    /// True once Pruned above has been computed.
     bool PrunedComputed;
 
     Val() : Resolution(CR_Keep), WriteLanes(0), ValidLanes(0),
@@ -1400,30 +1662,75 @@
     bool isAnalyzed() const { return WriteLanes != 0; }
   };
 
-  // One entry per value number in LI.
+  /// One entry per value number in LI.
   SmallVector<Val, 8> Vals;
 
-  unsigned computeWriteLanes(const MachineInstr *DefMI, bool &Redef);
-  VNInfo *stripCopies(VNInfo *VNI);
+  /// Compute the bitmask of lanes actually written by DefMI.
+  /// Set Redef if there are any partial register definitions that depend on the
+  /// previous value of the register.
+  unsigned computeWriteLanes(const MachineInstr *DefMI, bool &Redef) const;
+
+  /// Find the ultimate value that VNI was copied from.
+  std::pair<const VNInfo*,unsigned> followCopyChain(const VNInfo *VNI) const;
+
+  bool valuesIdentical(VNInfo *Val0, VNInfo *Val1, const JoinVals &Other) const;
+
+  /// Analyze ValNo in this live range, and set all fields of Vals[ValNo].
+  /// Return a conflict resolution when possible, but leave the hard cases as
+  /// CR_Unresolved.
+  /// Recursively calls computeAssignment() on this and Other, guaranteeing that
+  /// both OtherVNI and RedefVNI have been analyzed and mapped before returning.
+  /// The recursion always goes upwards in the dominator tree, making loops
+  /// impossible.
   ConflictResolution analyzeValue(unsigned ValNo, JoinVals &Other);
+
+  /// Compute the value assignment for ValNo in RI.
+  /// This may be called recursively by analyzeValue(), but never for a ValNo on
+  /// the stack.
   void computeAssignment(unsigned ValNo, JoinVals &Other);
+
+  /// Assuming ValNo is going to clobber some valid lanes in Other.LR, compute
+  /// the extent of the tainted lanes in the block.
+  ///
+  /// Multiple values in Other.LR can be affected since partial redefinitions
+  /// can preserve previously tainted lanes.
+  ///
+  ///   1 %dst = VLOAD           <-- Define all lanes in %dst
+  ///   2 %src = FOO             <-- ValNo to be joined with %dst:ssub0
+  ///   3 %dst:ssub1 = BAR       <-- Partial redef doesn't clear taint in ssub0
+  ///   4 %dst:ssub0 = COPY %src <-- Conflict resolved, ssub0 wasn't read
+  ///
+  /// For each ValNo in Other that is affected, add an (EndIndex, TaintedLanes)
+  /// entry to TaintedVals.
+  ///
+  /// Returns false if the tainted lanes extend beyond the basic block.
   bool taintExtent(unsigned, unsigned, JoinVals&,
                    SmallVectorImpl<std::pair<SlotIndex, unsigned> >&);
-  bool usesLanes(MachineInstr *MI, unsigned, unsigned, unsigned);
+
+  /// Return true if MI uses any of the given Lanes from Reg.
+  /// This does not include partial redefinitions of Reg.
+  bool usesLanes(const MachineInstr *MI, unsigned, unsigned, unsigned) const;
+
+  /// Determine if ValNo is a copy of a value number in LR or Other.LR that will
+  /// be pruned:
+  ///
+  ///   %dst = COPY %src
+  ///   %src = COPY %dst  <-- This value to be pruned.
+  ///   %dst = COPY %src  <-- This value is a copy of a pruned value.
   bool isPrunedValue(unsigned ValNo, JoinVals &Other);
 
 public:
-  JoinVals(LiveInterval &li, unsigned subIdx,
-           SmallVectorImpl<VNInfo*> &newVNInfo,
-           const CoalescerPair &cp,
-           LiveIntervals *lis,
-           const TargetRegisterInfo *tri)
-    : LI(li), SubIdx(subIdx), NewVNInfo(newVNInfo), CP(cp), LIS(lis),
-      Indexes(LIS->getSlotIndexes()), TRI(tri),
-      Assignments(LI.getNumValNums(), -1), Vals(LI.getNumValNums())
+  JoinVals(LiveRange &LR, unsigned Reg, unsigned SubIdx, unsigned LaneMask,
+           SmallVectorImpl<VNInfo*> &newVNInfo, const CoalescerPair &cp,
+           LiveIntervals *lis, const TargetRegisterInfo *TRI, bool SubRangeJoin,
+           bool TrackSubRegLiveness)
+    : LR(LR), Reg(Reg), SubIdx(SubIdx), LaneMask(LaneMask),
+      SubRangeJoin(SubRangeJoin), TrackSubRegLiveness(TrackSubRegLiveness),
+      NewVNInfo(newVNInfo), CP(cp), LIS(lis), Indexes(LIS->getSlotIndexes()),
+      TRI(TRI), Assignments(LR.getNumValNums(), -1), Vals(LR.getNumValNums())
   {}
 
-  /// Analyze defs in LI and compute a value mapping in NewVNInfo.
+  /// Analyze defs in LR and compute a value mapping in NewVNInfo.
   /// Returns false if any conflicts were impossible to resolve.
   bool mapValues(JoinVals &Other);
 
@@ -1431,10 +1738,16 @@
   /// Returns false if any conflicts were impossible to resolve.
   bool resolveConflicts(JoinVals &Other);
 
-  /// Prune the live range of values in Other.LI where they would conflict with
-  /// CR_Replace values in LI. Collect end points for restoring the live range
+  /// Prune the live range of values in Other.LR where they would conflict with
+  /// CR_Replace values in LR. Collect end points for restoring the live range
   /// after joining.
-  void pruneValues(JoinVals &Other, SmallVectorImpl<SlotIndex> &EndPoints);
+  void pruneValues(JoinVals &Other, SmallVectorImpl<SlotIndex> &EndPoints,
+                   bool changeInstrs);
+
+  /// Removes subranges starting at copies that get removed. This sometimes
+  /// happens when undefined subranges are copied around. These ranges contain
+  /// no useful information and can be removed.
+  void pruneSubRegValues(LiveInterval &LI, unsigned &ShrinkMask);
 
   /// Erase any machine instructions that have been coalesced away.
   /// Add erased instructions to ErasedInstrs.
@@ -1448,13 +1761,11 @@
 };
 } // end anonymous namespace
 
-/// Compute the bitmask of lanes actually written by DefMI.
-/// Set Redef if there are any partial register definitions that depend on the
-/// previous value of the register.
-unsigned JoinVals::computeWriteLanes(const MachineInstr *DefMI, bool &Redef) {
+unsigned JoinVals::computeWriteLanes(const MachineInstr *DefMI, bool &Redef)
+  const {
   unsigned L = 0;
   for (ConstMIOperands MO(DefMI); MO.isValid(); ++MO) {
-    if (!MO->isReg() || MO->getReg() != LI.reg || !MO->isDef())
+    if (!MO->isReg() || MO->getReg() != Reg || !MO->isDef())
       continue;
     L |= TRI->getSubRegIndexLaneMask(
            TRI->composeSubRegIndices(SubIdx, MO->getSubReg()));
@@ -1464,36 +1775,71 @@
   return L;
 }
 
-/// Find the ultimate value that VNI was copied from.
-VNInfo *JoinVals::stripCopies(VNInfo *VNI) {
+std::pair<const VNInfo*, unsigned> JoinVals::followCopyChain(
+    const VNInfo *VNI) const {
+  unsigned Reg = this->Reg;
+
   while (!VNI->isPHIDef()) {
-    MachineInstr *MI = Indexes->getInstructionFromIndex(VNI->def);
+    SlotIndex Def = VNI->def;
+    MachineInstr *MI = Indexes->getInstructionFromIndex(Def);
     assert(MI && "No defining instruction");
     if (!MI->isFullCopy())
+      return std::make_pair(VNI, Reg);
+    unsigned SrcReg = MI->getOperand(1).getReg();
+    if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+      return std::make_pair(VNI, Reg);
+
+    const LiveInterval &LI = LIS->getInterval(SrcReg);
+    const VNInfo *ValueIn;
+    // No subrange involved.
+    if (!SubRangeJoin || !LI.hasSubRanges()) {
+      LiveQueryResult LRQ = LI.Query(Def);
+      ValueIn = LRQ.valueIn();
+    } else {
+      // Query subranges. Pick the first matching one.
+      ValueIn = nullptr;
+      for (const LiveInterval::SubRange &S : LI.subranges()) {
+        // Transform lanemask to a mask in the joined live interval.
+        unsigned SMask = TRI->composeSubRegIndexLaneMask(SubIdx, S.LaneMask);
+        if ((SMask & LaneMask) == 0)
+          continue;
+        LiveQueryResult LRQ = S.Query(Def);
+        ValueIn = LRQ.valueIn();
+        break;
+      }
+    }
+    if (ValueIn == nullptr)
       break;
-    unsigned Reg = MI->getOperand(1).getReg();
-    if (!TargetRegisterInfo::isVirtualRegister(Reg))
-      break;
-    LiveQueryResult LRQ = LIS->getInterval(Reg).Query(VNI->def);
-    if (!LRQ.valueIn())
-      break;
-    VNI = LRQ.valueIn();
+    VNI = ValueIn;
+    Reg = SrcReg;
   }
-  return VNI;
+  return std::make_pair(VNI, Reg);
 }
 
-/// Analyze ValNo in this live range, and set all fields of Vals[ValNo].
-/// Return a conflict resolution when possible, but leave the hard cases as
-/// CR_Unresolved.
-/// Recursively calls computeAssignment() on this and Other, guaranteeing that
-/// both OtherVNI and RedefVNI have been analyzed and mapped before returning.
-/// The recursion always goes upwards in the dominator tree, making loops
-/// impossible.
+bool JoinVals::valuesIdentical(VNInfo *Value0, VNInfo *Value1,
+                               const JoinVals &Other) const {
+  const VNInfo *Orig0;
+  unsigned Reg0;
+  std::tie(Orig0, Reg0) = followCopyChain(Value0);
+  if (Orig0 == Value1)
+    return true;
+
+  const VNInfo *Orig1;
+  unsigned Reg1;
+  std::tie(Orig1, Reg1) = Other.followCopyChain(Value1);
+
+  // The values are equal if they are defined at the same place and use the
+  // same register. Note that we cannot compare VNInfos directly as some of
+  // them might be from a copy created in mergeSubRangeInto() while the other
+  // is from the original LiveInterval.
+  return Orig0->def == Orig1->def && Reg0 == Reg1;
+}
+
 JoinVals::ConflictResolution
 JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
   Val &V = Vals[ValNo];
   assert(!V.isAnalyzed() && "Value has already been analyzed!");
-  VNInfo *VNI = LI.getValNumInfo(ValNo);
+  VNInfo *VNI = LR.getValNumInfo(ValNo);
   if (VNI->isUnused()) {
     V.WriteLanes = ~0u;
     return CR_Keep;
@@ -1503,46 +1849,56 @@
   const MachineInstr *DefMI = nullptr;
   if (VNI->isPHIDef()) {
     // Conservatively assume that all lanes in a PHI are valid.
-    V.ValidLanes = V.WriteLanes = TRI->getSubRegIndexLaneMask(SubIdx);
+    unsigned Lanes = SubRangeJoin ? 1 : TRI->getSubRegIndexLaneMask(SubIdx);
+    V.ValidLanes = V.WriteLanes = Lanes;
   } else {
     DefMI = Indexes->getInstructionFromIndex(VNI->def);
-    bool Redef = false;
-    V.ValidLanes = V.WriteLanes = computeWriteLanes(DefMI, Redef);
+    assert(DefMI != nullptr);
+    if (SubRangeJoin) {
+      // We don't care about the lanes when joining subregister ranges.
+      V.ValidLanes = V.WriteLanes = 1;
+    } else {
+      bool Redef = false;
+      V.ValidLanes = V.WriteLanes = computeWriteLanes(DefMI, Redef);
 
-    // If this is a read-modify-write instruction, there may be more valid
-    // lanes than the ones written by this instruction.
-    // This only covers partial redef operands. DefMI may have normal use
-    // operands reading the register. They don't contribute valid lanes.
-    //
-    // This adds ssub1 to the set of valid lanes in %src:
-    //
-    //   %src:ssub1<def> = FOO
-    //
-    // This leaves only ssub1 valid, making any other lanes undef:
-    //
-    //   %src:ssub1<def,read-undef> = FOO %src:ssub2
-    //
-    // The <read-undef> flag on the def operand means that old lane values are
-    // not important.
-    if (Redef) {
-      V.RedefVNI = LI.Query(VNI->def).valueIn();
-      assert(V.RedefVNI && "Instruction is reading nonexistent value");
-      computeAssignment(V.RedefVNI->id, Other);
-      V.ValidLanes |= Vals[V.RedefVNI->id].ValidLanes;
-    }
+      // If this is a read-modify-write instruction, there may be more valid
+      // lanes than the ones written by this instruction.
+      // This only covers partial redef operands. DefMI may have normal use
+      // operands reading the register. They don't contribute valid lanes.
+      //
+      // This adds ssub1 to the set of valid lanes in %src:
+      //
+      //   %src:ssub1<def> = FOO
+      //
+      // This leaves only ssub1 valid, making any other lanes undef:
+      //
+      //   %src:ssub1<def,read-undef> = FOO %src:ssub2
+      //
+      // The <read-undef> flag on the def operand means that old lane values are
+      // not important.
+      if (Redef) {
+        V.RedefVNI = LR.Query(VNI->def).valueIn();
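+        // With subregister liveness there may be no value live in the main
+        // range at this point, so RedefVNI can be null here.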
+        assert((TrackSubRegLiveness || V.RedefVNI) &&
+               "Instruction is reading nonexistent value");
+        if (V.RedefVNI != nullptr) {
+          computeAssignment(V.RedefVNI->id, Other);
+          V.ValidLanes |= Vals[V.RedefVNI->id].ValidLanes;
+        }
+      }
 
-    // An IMPLICIT_DEF writes undef values.
-    if (DefMI->isImplicitDef()) {
-      // We normally expect IMPLICIT_DEF values to be live only until the end
-      // of their block. If the value is really live longer and gets pruned in
-      // another block, this flag is cleared again.
-      V.ErasableImplicitDef = true;
-      V.ValidLanes &= ~V.WriteLanes;
+      // An IMPLICIT_DEF writes undef values.
+      if (DefMI->isImplicitDef()) {
+        // We normally expect IMPLICIT_DEF values to be live only until the end
+        // of their block. If the value is really live longer and gets pruned in
+        // another block, this flag is cleared again.
+        V.ErasableImplicitDef = true;
+        V.ValidLanes &= ~V.WriteLanes;
+      }
     }
   }
 
   // Find the value in Other that overlaps VNI->def, if any.
-  LiveQueryResult OtherLRQ = Other.LI.Query(VNI->def);
+  LiveQueryResult OtherLRQ = Other.LR.Query(VNI->def);
 
   // It is possible that both values are defined by the same instruction, or
   // the values are PHIs defined in the same block. When that happens, the two
@@ -1612,8 +1968,14 @@
     return CR_Replace;
 
   // Check for simple erasable conflicts.
-  if (DefMI->isImplicitDef())
+  if (DefMI->isImplicitDef()) {
+    // We need the def for the subregister if nothing else is live in the
+    // subrange at this point.
+    if (TrackSubRegLiveness
+        && (V.WriteLanes & (OtherV.ValidLanes | OtherV.WriteLanes)) == 0)
+      return CR_Replace;
     return CR_Erase;
+  }
 
   // Include the non-conflict where DefMI is a coalescable copy that kills
   // OtherVNI. We still want the copy erased and value numbers merged.
@@ -1634,8 +1996,8 @@
   //   %other = COPY %ext
   //   %this  = COPY %ext <-- Erase this copy
   //
-  if (DefMI->isFullCopy() && !CP.isPartial() &&
-      stripCopies(VNI) == stripCopies(V.OtherVNI))
+  if (DefMI->isFullCopy() && !CP.isPartial()
+      && valuesIdentical(VNI, V.OtherVNI, Other))
     return CR_Erase;
 
   // If the lanes written by this instruction were all undef in OtherVNI, it is
@@ -1670,7 +2032,7 @@
   // VNI is clobbering live lanes in OtherVNI, but there is still the
   // possibility that no instructions actually read the clobbered lanes.
   // If we're clobbering all the lanes in OtherVNI, at least one must be read.
-  // Otherwise Other.LI wouldn't be live here.
+  // Otherwise Other.LR wouldn't be live here.
   if ((TRI->getSubRegIndexLaneMask(Other.SubIdx) & ~V.WriteLanes) == 0)
     return CR_Impossible;
 
@@ -1691,9 +2053,6 @@
   return CR_Unresolved;
 }
 
-/// Compute the value assignment for ValNo in LI.
-/// This may be called recursively by analyzeValue(), but never for a ValNo on
-/// the stack.
 void JoinVals::computeAssignment(unsigned ValNo, JoinVals &Other) {
   Val &V = Vals[ValNo];
   if (V.isAnalyzed()) {
@@ -1709,73 +2068,64 @@
     assert(V.OtherVNI && "OtherVNI not assigned, can't merge.");
     assert(Other.Vals[V.OtherVNI->id].isAnalyzed() && "Missing recursion");
     Assignments[ValNo] = Other.Assignments[V.OtherVNI->id];
-    DEBUG(dbgs() << "\t\tmerge " << PrintReg(LI.reg) << ':' << ValNo << '@'
-                 << LI.getValNumInfo(ValNo)->def << " into "
-                 << PrintReg(Other.LI.reg) << ':' << V.OtherVNI->id << '@'
+    DEBUG(dbgs() << "\t\tmerge " << PrintReg(Reg) << ':' << ValNo << '@'
+                 << LR.getValNumInfo(ValNo)->def << " into "
+                 << PrintReg(Other.Reg) << ':' << V.OtherVNI->id << '@'
                  << V.OtherVNI->def << " --> @"
                  << NewVNInfo[Assignments[ValNo]]->def << '\n');
     break;
   case CR_Replace:
-  case CR_Unresolved:
+  case CR_Unresolved: {
     // The other value is going to be pruned if this join is successful.
     assert(V.OtherVNI && "OtherVNI not assigned, can't prune");
-    Other.Vals[V.OtherVNI->id].Pruned = true;
+    Val &OtherV = Other.Vals[V.OtherVNI->id];
+    // We cannot erase an IMPLICIT_DEF if we don't have valid values for all
+    // its lanes.
+    if ((OtherV.WriteLanes & ~V.ValidLanes) != 0 && TrackSubRegLiveness)
+      OtherV.ErasableImplicitDef = false;
+    OtherV.Pruned = true;
+  }
     // Fall through.
   default:
     // This value number needs to go in the final joined live range.
     Assignments[ValNo] = NewVNInfo.size();
-    NewVNInfo.push_back(LI.getValNumInfo(ValNo));
+    NewVNInfo.push_back(LR.getValNumInfo(ValNo));
     break;
   }
 }
 
 bool JoinVals::mapValues(JoinVals &Other) {
-  for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) {
+  for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
     computeAssignment(i, Other);
     if (Vals[i].Resolution == CR_Impossible) {
-      DEBUG(dbgs() << "\t\tinterference at " << PrintReg(LI.reg) << ':' << i
-                   << '@' << LI.getValNumInfo(i)->def << '\n');
+      DEBUG(dbgs() << "\t\tinterference at " << PrintReg(Reg) << ':' << i
+                   << '@' << LR.getValNumInfo(i)->def << '\n');
       return false;
     }
   }
   return true;
 }
 
-/// Assuming ValNo is going to clobber some valid lanes in Other.LI, compute
-/// the extent of the tainted lanes in the block.
-///
-/// Multiple values in Other.LI can be affected since partial redefinitions can
-/// preserve previously tainted lanes.
-///
-///   1 %dst = VLOAD           <-- Define all lanes in %dst
-///   2 %src = FOO             <-- ValNo to be joined with %dst:ssub0
-///   3 %dst:ssub1 = BAR       <-- Partial redef doesn't clear taint in ssub0
-///   4 %dst:ssub0 = COPY %src <-- Conflict resolved, ssub0 wasn't read
-///
-/// For each ValNo in Other that is affected, add an (EndIndex, TaintedLanes)
-/// entry to TaintedVals.
-///
-/// Returns false if the tainted lanes extend beyond the basic block.
 bool JoinVals::
 taintExtent(unsigned ValNo, unsigned TaintedLanes, JoinVals &Other,
             SmallVectorImpl<std::pair<SlotIndex, unsigned> > &TaintExtent) {
-  VNInfo *VNI = LI.getValNumInfo(ValNo);
+  VNInfo *VNI = LR.getValNumInfo(ValNo);
   MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def);
   SlotIndex MBBEnd = Indexes->getMBBEndIdx(MBB);
 
-  // Scan Other.LI from VNI.def to MBBEnd.
-  LiveInterval::iterator OtherI = Other.LI.find(VNI->def);
-  assert(OtherI != Other.LI.end() && "No conflict?");
+  // Scan Other.LR from VNI.def to MBBEnd.
+  LiveInterval::iterator OtherI = Other.LR.find(VNI->def);
+  assert(OtherI != Other.LR.end() && "No conflict?");
   do {
     // OtherI is pointing to a tainted value. Abort the join if the tainted
     // lanes escape the block.
     SlotIndex End = OtherI->end;
     if (End >= MBBEnd) {
-      DEBUG(dbgs() << "\t\ttaints global " << PrintReg(Other.LI.reg) << ':'
+      DEBUG(dbgs() << "\t\ttaints global " << PrintReg(Other.Reg) << ':'
                    << OtherI->valno->id << '@' << OtherI->start << '\n');
       return false;
     }
-    DEBUG(dbgs() << "\t\ttaints local " << PrintReg(Other.LI.reg) << ':'
+    DEBUG(dbgs() << "\t\ttaints local " << PrintReg(Other.Reg) << ':'
                  << OtherI->valno->id << '@' << OtherI->start
                  << " to " << End << '\n');
     // A dead def is not a problem.
@@ -1784,7 +2134,7 @@
     TaintExtent.push_back(std::make_pair(End, TaintedLanes));
 
     // Check for another def in the MBB.
-    if (++OtherI == Other.LI.end() || OtherI->start >= MBBEnd)
+    if (++OtherI == Other.LR.end() || OtherI->start >= MBBEnd)
       break;
 
     // Lanes written by the new def are no longer tainted.
@@ -1796,10 +2146,8 @@
   return true;
 }
 
-/// Return true if MI uses any of the given Lanes from Reg.
-/// This does not include partial redefinitions of Reg.
-bool JoinVals::usesLanes(MachineInstr *MI, unsigned Reg, unsigned SubIdx,
-                         unsigned Lanes) {
+bool JoinVals::usesLanes(const MachineInstr *MI, unsigned Reg, unsigned SubIdx,
+                         unsigned Lanes) const {
   if (MI->isDebugValue())
     return false;
   for (ConstMIOperands MO(MI); MO.isValid(); ++MO) {
@@ -1815,16 +2163,19 @@
 }
 
 bool JoinVals::resolveConflicts(JoinVals &Other) {
-  for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) {
+  for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
     Val &V = Vals[i];
     assert (V.Resolution != CR_Impossible && "Unresolvable conflict");
     if (V.Resolution != CR_Unresolved)
       continue;
-    DEBUG(dbgs() << "\t\tconflict at " << PrintReg(LI.reg) << ':' << i
-                 << '@' << LI.getValNumInfo(i)->def << '\n');
+    DEBUG(dbgs() << "\t\tconflict at " << PrintReg(Reg) << ':' << i
+                 << '@' << LR.getValNumInfo(i)->def << '\n');
+    if (SubRangeJoin)
+      return false;
+
     ++NumLaneConflicts;
     assert(V.OtherVNI && "Inconsistent conflict resolution.");
-    VNInfo *VNI = LI.getValNumInfo(i);
+    VNInfo *VNI = LR.getValNumInfo(i);
     const Val &OtherV = Other.Vals[V.OtherVNI->id];
 
     // VNI is known to clobber some lanes in OtherVNI. If we go ahead with the
@@ -1854,7 +2205,7 @@
     unsigned TaintNum = 0;
     for(;;) {
       assert(MI != MBB->end() && "Bad LastMI");
-      if (usesLanes(MI, Other.LI.reg, Other.SubIdx, TaintedLanes)) {
+      if (usesLanes(MI, Other.Reg, Other.SubIdx, TaintedLanes)) {
         DEBUG(dbgs() << "\t\ttainted lanes used by: " << *MI);
         return false;
       }
@@ -1876,13 +2227,6 @@
   return true;
 }
 
-// Determine if ValNo is a copy of a value number in LI or Other.LI that will
-// be pruned:
-//
-//   %dst = COPY %src
-//   %src = COPY %dst  <-- This value to be pruned.
-//   %dst = COPY %src  <-- This value is a copy of a pruned value.
-//
 bool JoinVals::isPrunedValue(unsigned ValNo, JoinVals &Other) {
   Val &V = Vals[ValNo];
   if (V.Pruned || V.PrunedComputed)
@@ -1899,15 +2243,16 @@
 }
 
 void JoinVals::pruneValues(JoinVals &Other,
-                           SmallVectorImpl<SlotIndex> &EndPoints) {
-  for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) {
-    SlotIndex Def = LI.getValNumInfo(i)->def;
+                           SmallVectorImpl<SlotIndex> &EndPoints,
+                           bool changeInstrs) {
+  for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
+    SlotIndex Def = LR.getValNumInfo(i)->def;
     switch (Vals[i].Resolution) {
     case CR_Keep:
       break;
     case CR_Replace: {
-      // This value takes precedence over the value in Other.LI.
-      LIS->pruneValue(&Other.LI, Def, &EndPoints);
+      // This value takes precedence over the value in Other.LR.
+      LIS->pruneValue(Other.LR, Def, &EndPoints);
       // Check if we're replacing an IMPLICIT_DEF value. The IMPLICIT_DEF
       // instructions are only inserted to provide a live-out value for PHI
       // predecessors, so the instruction should simply go away once its value
@@ -1916,34 +2261,37 @@
       bool EraseImpDef = OtherV.ErasableImplicitDef &&
                          OtherV.Resolution == CR_Keep;
       if (!Def.isBlock()) {
-        // Remove <def,read-undef> flags. This def is now a partial redef.
-        // Also remove <def,dead> flags since the joined live range will
-        // continue past this instruction.
-        for (MIOperands MO(Indexes->getInstructionFromIndex(Def));
-             MO.isValid(); ++MO)
-          if (MO->isReg() && MO->isDef() && MO->getReg() == LI.reg) {
-            MO->setIsUndef(EraseImpDef);
-            MO->setIsDead(false);
+        if (changeInstrs) {
+          // Remove <def,read-undef> flags. This def is now a partial redef.
+          // Also remove <def,dead> flags since the joined live range will
+          // continue past this instruction.
+          for (MIOperands MO(Indexes->getInstructionFromIndex(Def));
+               MO.isValid(); ++MO) {
+            if (MO->isReg() && MO->isDef() && MO->getReg() == Reg) {
+              MO->setIsUndef(EraseImpDef);
+              MO->setIsDead(false);
+            }
           }
+        }
         // This value will reach instructions below, but we need to make sure
         // the live range also reaches the instruction at Def.
         if (!EraseImpDef)
           EndPoints.push_back(Def);
       }
-      DEBUG(dbgs() << "\t\tpruned " << PrintReg(Other.LI.reg) << " at " << Def
-                   << ": " << Other.LI << '\n');
+      DEBUG(dbgs() << "\t\tpruned " << PrintReg(Other.Reg) << " at " << Def
+                   << ": " << Other.LR << '\n');
       break;
     }
     case CR_Erase:
     case CR_Merge:
       if (isPrunedValue(i, Other)) {
-        // This value is ultimately a copy of a pruned value in LI or Other.LI.
+        // This value is ultimately a copy of a pruned value in LR or Other.LR.
         // We can no longer trust the value mapping computed by
         // computeAssignment(), the value that was originally copied could have
         // been replaced.
-        LIS->pruneValue(&LI, Def, &EndPoints);
-        DEBUG(dbgs() << "\t\tpruned all of " << PrintReg(LI.reg) << " at "
-                     << Def << ": " << LI << '\n');
+        LIS->pruneValue(LR, Def, &EndPoints);
+        DEBUG(dbgs() << "\t\tpruned all of " << PrintReg(Reg) << " at "
+                     << Def << ": " << LR << '\n');
       }
       break;
     case CR_Unresolved:
@@ -1953,25 +2301,65 @@
   }
 }
 
+void JoinVals::pruneSubRegValues(LiveInterval &LI, unsigned &ShrinkMask) {
+  // Look for values being erased.
+  bool DidPrune = false;
+  for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
+    if (Vals[i].Resolution != CR_Erase)
+      continue;
+
+    // Check subranges at the point where the copy will be removed.
+    SlotIndex Def = LR.getValNumInfo(i)->def;
+    for (LiveInterval::SubRange &S : LI.subranges()) {
+      LiveQueryResult Q = S.Query(Def);
+
+      // If a subrange starts at the copy then an undefined value has been
+      // copied and we must remove that subrange value as well.
+      VNInfo *ValueOut = Q.valueOutOrDead();
+      if (ValueOut != nullptr && Q.valueIn() == nullptr) {
+        DEBUG(dbgs() << "\t\tPrune sublane " << format("%04X", S.LaneMask)
+                     << " at " << Def << "\n");
+        LIS->pruneValue(S, Def, nullptr);
+        DidPrune = true;
+        // Mark value number as unused.
+        ValueOut->markUnused();
+        continue;
+      }
+      // If a subrange ends at the copy, then a value was copied but only
+      // partially used later. Shrink the subregister range appropriately.
+      if (Q.valueIn() != nullptr && Q.valueOut() == nullptr) {
+        DEBUG(dbgs() << "\t\tDead uses at sublane "
+                     << format("%04X", S.LaneMask) << " at " << Def << "\n");
+        ShrinkMask |= S.LaneMask;
+      }
+    }
+  }
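+  // Pruning may have left some subranges without any segments; drop those.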
+  if (DidPrune)
+    LI.removeEmptySubRanges();
+}
+
 void JoinVals::eraseInstrs(SmallPtrSetImpl<MachineInstr*> &ErasedInstrs,
                            SmallVectorImpl<unsigned> &ShrinkRegs) {
-  for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) {
+  for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
     // Get the def location before markUnused() below invalidates it.
-    SlotIndex Def = LI.getValNumInfo(i)->def;
+    SlotIndex Def = LR.getValNumInfo(i)->def;
     switch (Vals[i].Resolution) {
-    case CR_Keep:
+    case CR_Keep: {
       // If an IMPLICIT_DEF value is pruned, it doesn't serve a purpose any
       // longer. The IMPLICIT_DEF instructions are only inserted by
       // PHIElimination to guarantee that all PHI predecessors have a value.
       if (!Vals[i].ErasableImplicitDef || !Vals[i].Pruned)
         break;
-      // Remove value number i from LI. Note that this VNInfo is still present
-      // in NewVNInfo, so it will appear as an unused value number in the final
-      // joined interval.
-      LI.getValNumInfo(i)->markUnused();
-      LI.removeValNo(LI.getValNumInfo(i));
-      DEBUG(dbgs() << "\t\tremoved " << i << '@' << Def << ": " << LI << '\n');
+      // Remove value number i from LR.
+      VNInfo *VNI = LR.getValNumInfo(i);
+      LR.removeValNo(VNI);
+      // Note that this VNInfo is reused and still referenced in NewVNInfo,
+      // so make it appear as an unused value number.
+      VNI->markUnused();
+      DEBUG(dbgs() << "\t\tremoved " << i << '@' << Def << ": " << LR << '\n');
       // FALL THROUGH.
+    }
 
     case CR_Erase: {
       MachineInstr *MI = Indexes->getInstructionFromIndex(Def);
@@ -1994,12 +2382,96 @@
   }
 }
 
+void RegisterCoalescer::joinSubRegRanges(LiveRange &LRange, LiveRange &RRange,
+                                         unsigned LaneMask,
+                                         const CoalescerPair &CP) {
+  SmallVector<VNInfo*, 16> NewVNInfo;
+  JoinVals RHSVals(RRange, CP.getSrcReg(), CP.getSrcIdx(), LaneMask,
+                   NewVNInfo, CP, LIS, TRI, true, true);
+  JoinVals LHSVals(LRange, CP.getDstReg(), CP.getDstIdx(), LaneMask,
+                   NewVNInfo, CP, LIS, TRI, true, true);
+
+  // Compute NewVNInfo and resolve conflicts (see also joinVirtRegs())
+  // Conflicts should already be resolved so the mapping/resolution should
+  // always succeed.
+  if (!LHSVals.mapValues(RHSVals) || !RHSVals.mapValues(LHSVals))
+    llvm_unreachable("Can't join subrange although main ranges are compatible");
+  if (!LHSVals.resolveConflicts(RHSVals) || !RHSVals.resolveConflicts(LHSVals))
+    llvm_unreachable("Can't join subrange although main ranges are compatible");
+
+  // The merging algorithm in LiveInterval::join() can't handle conflicting
+  // value mappings, so we need to remove any live ranges that overlap a
+  // CR_Replace resolution. Collect a set of end points that can be used to
+  // restore the live range after joining.
+  SmallVector<SlotIndex, 8> EndPoints;
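+  // Operand flags are not rewritten here (changeInstrs = false); that happens
+  // when the main ranges are pruned.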
+  LHSVals.pruneValues(RHSVals, EndPoints, false);
+  RHSVals.pruneValues(LHSVals, EndPoints, false);
+
+  LRange.verify();
+  RRange.verify();
+
+  // Join RRange into LHS.
+  LRange.join(RRange, LHSVals.getAssignments(), RHSVals.getAssignments(),
+              NewVNInfo);
+
+  DEBUG(dbgs() << "\t\tjoined lanes: " << LRange << "\n");
+  if (EndPoints.empty())
+    return;
+
+  // Recompute the parts of the live range we had to remove because of
+  // CR_Replace conflicts.
+  DEBUG(dbgs() << "\t\trestoring liveness to " << EndPoints.size()
+               << " points: " << LRange << '\n');
+  LIS->extendToIndices(LRange, EndPoints);
+}
+
+void RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI,
+                                          const LiveRange &ToMerge,
+                                          unsigned LaneMask, CoalescerPair &CP) {
+  BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+  for (LiveInterval::SubRange &R : LI.subranges()) {
+    unsigned RMask = R.LaneMask;
+    // LaneMask of subregisters common to subrange R and ToMerge.
+    unsigned Common = RMask & LaneMask;
+    // There is nothing to do without common subregs.
+    if (Common == 0)
+      continue;
+
+    DEBUG(dbgs() << format("\t\tCopy+Merge %04X into %04X\n", RMask, Common));
+    // LaneMask of subregisters contained in the R range but not in ToMerge;
+    // they have to be split into their own subrange.
+    unsigned LRest = RMask & ~LaneMask;
+    LiveInterval::SubRange *CommonRange;
+    if (LRest != 0) {
+      R.LaneMask = LRest;
+      DEBUG(dbgs() << format("\t\tReduce Lane to %04X\n", LRest));
+      // Duplicate SubRange for the newly merged common lanes.
+      CommonRange = LI.createSubRangeFrom(Allocator, Common, R);
+    } else {
+      // Reuse the existing range.
+      R.LaneMask = Common;
+      CommonRange = &R;
+    }
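+    // Join on a copy so ToMerge itself stays intact for the remaining
+    // subranges.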
+    LiveRange RangeCopy(ToMerge, Allocator);
+    joinSubRegRanges(*CommonRange, RangeCopy, Common, CP);
+    LaneMask &= ~RMask;
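+    // The common lanes are handled now; any lanes still left get their own
+    // new subrange below.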
+  }
+
+  if (LaneMask != 0) {
+    DEBUG(dbgs() << format("\t\tNew Lane %04X\n", LaneMask));
+    LI.createSubRangeFrom(Allocator, LaneMask, ToMerge);
+  }
+}
+
 bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
   SmallVector<VNInfo*, 16> NewVNInfo;
   LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
   LiveInterval &LHS = LIS->getInterval(CP.getDstReg());
-  JoinVals RHSVals(RHS, CP.getSrcIdx(), NewVNInfo, CP, LIS, TRI);
-  JoinVals LHSVals(LHS, CP.getDstIdx(), NewVNInfo, CP, LIS, TRI);
+  bool TrackSubRegLiveness = MRI->tracksSubRegLiveness();
+  JoinVals RHSVals(RHS, CP.getSrcReg(), CP.getSrcIdx(), 0, NewVNInfo, CP, LIS,
+                   TRI, false, TrackSubRegLiveness);
+  JoinVals LHSVals(LHS, CP.getDstReg(), CP.getDstIdx(), 0, NewVNInfo, CP, LIS,
+                   TRI, false, TrackSubRegLiveness);
 
   DEBUG(dbgs() << "\t\tRHS = " << RHS
                << "\n\t\tLHS = " << LHS
@@ -2015,14 +2487,55 @@
     return false;
 
   // All clear, the live ranges can be merged.
+  if (RHS.hasSubRanges() || LHS.hasSubRanges()) {
+    BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+
+    // Transform lanemasks from the LHS to masks in the coalesced register and
+    // create initial subranges if necessary.
+    unsigned DstIdx = CP.getDstIdx();
+    if (!LHS.hasSubRanges()) {
+      unsigned Mask = DstIdx == 0 ? CP.getNewRC()->getLaneMask()
+                                  : TRI->getSubRegIndexLaneMask(DstIdx);
+      // LHS must support subregs or we wouldn't be in this codepath.
+      assert(Mask != 0);
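+      // Turn the whole main range into a single subrange covering all lanes.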
+      LHS.createSubRangeFrom(Allocator, Mask, LHS);
+    } else if (DstIdx != 0) {
+      // Transform LHS lanemasks to new register class if necessary.
+      for (LiveInterval::SubRange &R : LHS.subranges()) {
+        unsigned Mask = TRI->composeSubRegIndexLaneMask(DstIdx, R.LaneMask);
+        R.LaneMask = Mask;
+      }
+    }
+    DEBUG(dbgs() << "\t\tLHST = " << PrintReg(CP.getDstReg())
+                 << ' ' << LHS << '\n');
+
+    // Determine lanemasks of RHS in the coalesced register and merge subranges.
+    unsigned SrcIdx = CP.getSrcIdx();
+    if (!RHS.hasSubRanges()) {
+      unsigned Mask = SrcIdx == 0 ? CP.getNewRC()->getLaneMask()
+                                  : TRI->getSubRegIndexLaneMask(SrcIdx);
+      mergeSubRangeInto(LHS, RHS, Mask, CP);
+    } else {
+      // Pair up subranges and merge.
+      for (LiveInterval::SubRange &R : RHS.subranges()) {
+        unsigned Mask = TRI->composeSubRegIndexLaneMask(SrcIdx, R.LaneMask);
+        mergeSubRangeInto(LHS, R, Mask, CP);
+      }
+    }
+
+    DEBUG(dbgs() << "\tJoined SubRanges " << LHS << "\n");
+
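+    // Drop subrange values that were only defined by the copies being erased
+    // and remember lanes whose ranges may need shrinking.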
+    LHSVals.pruneSubRegValues(LHS, ShrinkMask);
+    RHSVals.pruneSubRegValues(LHS, ShrinkMask);
+  }
 
   // The merging algorithm in LiveInterval::join() can't handle conflicting
   // value mappings, so we need to remove any live ranges that overlap a
   // CR_Replace resolution. Collect a set of end points that can be used to
   // restore the live range after joining.
   SmallVector<SlotIndex, 8> EndPoints;
-  LHSVals.pruneValues(RHSVals, EndPoints);
-  RHSVals.pruneValues(LHSVals, EndPoints);
+  LHSVals.pruneValues(RHSVals, EndPoints, true);
+  RHSVals.pruneValues(LHSVals, EndPoints, true);
 
   // Erase COPY and IMPLICIT_DEF instructions. This may cause some external
   // registers to require trimming.
@@ -2041,24 +2554,23 @@
   MRI->clearKillFlags(LHS.reg);
   MRI->clearKillFlags(RHS.reg);
 
-  if (EndPoints.empty())
-    return true;
+  if (!EndPoints.empty()) {
+    // Recompute the parts of the live range we had to remove because of
+    // CR_Replace conflicts.
+    DEBUG(dbgs() << "\t\trestoring liveness to " << EndPoints.size()
+                 << " points: " << LHS << '\n');
+    LIS->extendToIndices((LiveRange&)LHS, EndPoints);
+  }
 
-  // Recompute the parts of the live range we had to remove because of
-  // CR_Replace conflicts.
-  DEBUG(dbgs() << "\t\trestoring liveness to " << EndPoints.size()
-               << " points: " << LHS << '\n');
-  LIS->extendToIndices(LHS, EndPoints);
   return true;
 }
 
-/// Attempt to join these two intervals.  On failure, this returns false.
 bool RegisterCoalescer::joinIntervals(CoalescerPair &CP) {
   return CP.isPhys() ? joinReservedPhysReg(CP) : joinVirtRegs(CP);
 }
 
 namespace {
-// Information concerning MBB coalescing priority.
+/// Information concerning MBB coalescing priority.
 struct MBBPriorityInfo {
   MachineBasicBlock *MBB;
   unsigned Depth;
@@ -2069,10 +2581,10 @@
 };
 }
 
-// C-style comparator that sorts first based on the loop depth of the basic
-// block (the unsigned), and then on the MBB number.
-//
-// EnableGlobalCopies assumes that the primary sort key is loop depth.
+/// C-style comparator that sorts first based on the loop depth of the basic
+/// block (the unsigned), and then on the MBB number.
+///
+/// EnableGlobalCopies assumes that the primary sort key is loop depth.
 static int compareMBBPriority(const MBBPriorityInfo *LHS,
                               const MBBPriorityInfo *RHS) {
   // Deeper loops first
@@ -2112,8 +2624,6 @@
     || LIS->intervalIsInOneMBB(LIS->getInterval(DstReg));
 }
 
-// Try joining WorkList copies starting from index From.
-// Null out any successful joins.
 bool RegisterCoalescer::
 copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList) {
   bool Progress = false;
@@ -2224,15 +2734,14 @@
   MF = &fn;
   MRI = &fn.getRegInfo();
   TM = &fn.getTarget();
-  TRI = TM->getSubtargetImpl()->getRegisterInfo();
-  TII = TM->getSubtargetImpl()->getInstrInfo();
+  const TargetSubtargetInfo &STI = fn.getSubtarget();
+  TRI = STI.getRegisterInfo();
+  TII = STI.getInstrInfo();
   LIS = &getAnalysis<LiveIntervals>();
   AA = &getAnalysis<AliasAnalysis>();
   Loops = &getAnalysis<MachineLoopInfo>();
-
-  const TargetSubtargetInfo &ST = TM->getSubtarget<TargetSubtargetInfo>();
   if (EnableGlobalCopies == cl::BOU_UNSET)
-    JoinGlobalCopies = ST.useMachineScheduler();
+    JoinGlobalCopies = STI.useMachineScheduler();
   else
     JoinGlobalCopies = (EnableGlobalCopies == cl::BOU_TRUE);
 
@@ -2264,9 +2773,24 @@
     unsigned Reg = InflateRegs[i];
     if (MRI->reg_nodbg_empty(Reg))
       continue;
-    if (MRI->recomputeRegClass(Reg, *TM)) {
+    if (MRI->recomputeRegClass(Reg)) {
       DEBUG(dbgs() << PrintReg(Reg) << " inflated to "
                    << TRI->getRegClassName(MRI->getRegClass(Reg)) << '\n');
+      LiveInterval &LI = LIS->getInterval(Reg);
+      unsigned MaxMask = MRI->getMaxLaneMaskForVReg(Reg);
+      if (MaxMask == 0) {
+        // If the inflated register class does not support subregisters anymore,
+        // remove the subranges.
+        LI.clearSubRanges();
+      } else {
+#ifndef NDEBUG
+        // If subranges are still supported, then the same subregs should still
+        // be supported.
+        for (LiveInterval::SubRange &S : LI.subranges()) {
+          assert ((S.LaneMask & ~MaxMask) == 0);
+        }
+#endif
+      }
       ++NumInflated;
     }
   }
@@ -2277,7 +2801,6 @@
   return true;
 }
 
-/// Implement the dump method.
 void RegisterCoalescer::print(raw_ostream &O, const Module* m) const {
    LIS->print(O, m);
 }
diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp
index 6f8b337..76a7fef 100644
--- a/lib/CodeGen/ScheduleDAG.cpp
+++ b/lib/CodeGen/ScheduleDAG.cpp
@@ -36,8 +36,8 @@
 void SchedulingPriorityQueue::anchor() { }
 
 ScheduleDAG::ScheduleDAG(MachineFunction &mf)
-    : TM(mf.getTarget()), TII(TM.getSubtargetImpl()->getInstrInfo()),
-      TRI(TM.getSubtargetImpl()->getRegisterInfo()), MF(mf),
+    : TM(mf.getTarget()), TII(mf.getSubtarget().getInstrInfo()),
+      TRI(mf.getSubtarget().getRegisterInfo()), MF(mf),
       MRI(mf.getRegInfo()), EntrySU(), ExitSU() {
 #ifndef NDEBUG
   StressSched = StressSchedOpt;
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index dece643..78bfd23 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -44,25 +44,24 @@
 
 static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
     cl::ZeroOrMore, cl::init(false),
-    cl::desc("Enable use of AA during MI GAD construction"));
+    cl::desc("Enable use of AA during MI DAG construction"));
 
 static cl::opt<bool> UseTBAA("use-tbaa-in-sched-mi", cl::Hidden,
-    cl::init(true), cl::desc("Enable use of TBAA during MI GAD construction"));
+    cl::init(true), cl::desc("Enable use of TBAA during MI DAG construction"));
 
 ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
                                      const MachineLoopInfo *mli,
-                                     bool IsPostRAFlag,
-                                     bool RemoveKillFlags,
+                                     bool IsPostRAFlag, bool RemoveKillFlags,
                                      LiveIntervals *lis)
-  : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()), LIS(lis),
-    IsPostRA(IsPostRAFlag), RemoveKillFlags(RemoveKillFlags),
-    CanHandleTerminators(false), FirstDbgValue(nullptr) {
+    : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()), LIS(lis),
+      IsPostRA(IsPostRAFlag), RemoveKillFlags(RemoveKillFlags),
+      CanHandleTerminators(false), FirstDbgValue(nullptr) {
   assert((IsPostRA || LIS) && "PreRA scheduling requires LiveIntervals");
   DbgValues.clear();
   assert(!(IsPostRA && MRI.getNumVirtRegs()) &&
          "Virtual registers must be removed prior to PostRA scheduling");
 
-  const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
+  const TargetSubtargetInfo &ST = mf.getSubtarget();
   SchedModel.init(ST.getSchedModel(), &ST, TII);
 }
 
@@ -253,7 +252,7 @@
   assert(MO.isDef() && "expect physreg def");
 
   // Ask the target if address-backscheduling is desirable, and if so how much.
-  const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
+  const TargetSubtargetInfo &ST = MF.getSubtarget();
 
   for (MCRegAliasIterator Alias(MO.getReg(), TRI, true);
        Alias.isValid(); ++Alias) {
@@ -444,7 +443,7 @@
       int DefOp = Def->findRegisterDefOperandIdx(Reg);
       dep.setLatency(SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx));
 
-      const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
+      const TargetSubtargetInfo &ST = MF.getSubtarget();
       ST.adjustSchedDependency(DefSU, SU, const_cast<SDep &>(dep));
       SU->addPred(dep);
     }
@@ -614,10 +613,10 @@
   }
   // Track current depth.
   (*Depth)++;
-  // Iterate over chain dependencies only.
+  // Iterate over memory dependencies only.
   for (SUnit::const_succ_iterator I = SUb->Succs.begin(), E = SUb->Succs.end();
        I != E; ++I)
-    if (I->isCtrl())
+    if (I->isNormalMemoryOrBarrier())
       iterateChainSucc (AA, MFI, SUa, I->getSUnit(), ExitSU, Depth, Visited);
   return *Depth;
 }
@@ -644,11 +643,12 @@
       Dep.setLatency(((*I)->getInstr()->mayLoad()) ? LatencyToLoad : 0);
       (*I)->addPred(Dep);
     }
-    // Now go through all the chain successors and iterate from them.
-    // Keep track of visited nodes.
+
+    // Iterate recursively over all previously added memory chain
+    // successors. Keep track of visited nodes.
     for (SUnit::const_succ_iterator J = (*I)->Succs.begin(),
          JE = (*I)->Succs.end(); J != JE; ++J)
-      if (J->isCtrl())
+      if (J->isNormalMemoryOrBarrier())
         iterateChainSucc (AA, MFI, SU, J->getSUnit(),
                           ExitSU, &Depth, Visited);
   }
@@ -742,7 +742,7 @@
 void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
                                         RegPressureTracker *RPTracker,
                                         PressureDiffs *PDiffs) {
-  const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
+  const TargetSubtargetInfo &ST = MF.getSubtarget();
   bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI
                                                        : ST.useAA();
   AliasAnalysis *AAForDep = UseAA ? AA : nullptr;
@@ -891,7 +891,7 @@
 
       // fall-through
     new_alias_chain:
-      // Chain all possibly aliasing memory references though SU.
+      // Chain all possibly aliasing memory references through SU.
       if (AliasChain) {
         unsigned ChainLatency = 0;
         if (AliasChain->getInstr()->mayLoad())
@@ -991,11 +991,9 @@
         // Add dependence on alias chain, if needed.
         if (AliasChain)
           addChainDependency(AAForDep, MFI, SU, AliasChain, RejectMemNodes);
-        // But we also should check dependent instructions for the
-        // SU in question.
-        adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes,
-                        TrueMemOrderLatency);
       }
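+      // Also check dependent instructions for the SU in question.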
+      adjustChainDeps(AA, MFI, SU, &ExitSU, RejectMemNodes,
+                      TrueMemOrderLatency);
     } else if (MI->mayLoad()) {
       bool MayAlias = true;
       if (MI->isInvariantLoad(AA)) {
diff --git a/lib/CodeGen/SelectionDAG/Android.mk b/lib/CodeGen/SelectionDAG/Android.mk
index 0e52ee3..9501ad9 100644
--- a/lib/CodeGen/SelectionDAG/Android.mk
+++ b/lib/CodeGen/SelectionDAG/Android.mk
@@ -22,6 +22,7 @@
   SelectionDAGDumper.cpp \
   SelectionDAGISel.cpp \
   SelectionDAGPrinter.cpp \
+  StatepointLowering.cpp \
   TargetLowering.cpp \
   TargetSelectionDAGInfo.cpp
 
diff --git a/lib/CodeGen/SelectionDAG/CMakeLists.txt b/lib/CodeGen/SelectionDAG/CMakeLists.txt
index 75e8167..fbedf2c 100644
--- a/lib/CodeGen/SelectionDAG/CMakeLists.txt
+++ b/lib/CodeGen/SelectionDAG/CMakeLists.txt
@@ -19,6 +19,7 @@
   SelectionDAGDumper.cpp
   SelectionDAGISel.cpp
   SelectionDAGPrinter.cpp
+  StatepointLowering.cpp
   ScheduleDAGVLIW.cpp
   TargetLowering.cpp
   TargetSelectionDAGInfo.cpp
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a1291ed..6129401 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17,9 +17,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -303,6 +303,8 @@
     SDValue visitEXTRACT_SUBVECTOR(SDNode *N);
     SDValue visitVECTOR_SHUFFLE(SDNode *N);
     SDValue visitINSERT_SUBVECTOR(SDNode *N);
+    SDValue visitMLOAD(SDNode *N);
+    SDValue visitMSTORE(SDNode *N);
 
     SDValue XformToShuffleWithZero(SDNode *N);
     SDValue ReassociateOps(unsigned Opc, SDLoc DL, SDValue LHS, SDValue RHS);
@@ -325,6 +327,7 @@
     SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
                                          unsigned HiOp);
     SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
+    SDValue CombineExtLoad(SDNode *N);
     SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
     SDValue BuildSDIV(SDNode *N);
     SDValue BuildSDIVPow2(SDNode *N);
@@ -361,6 +364,28 @@
     /// chain (aliasing node.)
     SDValue FindBetterChain(SDNode *N, SDValue Chain);
 
+    /// Holds a pointer to an LSBaseSDNode as well as information on where it
+    /// is located in a sequence of memory operations connected by a chain.
+    struct MemOpLink {
+      MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq):
+      MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { }
+      // Ptr to the mem node.
+      LSBaseSDNode *MemNode;
+      // Offset from the base ptr.
+      int64_t OffsetFromBase;
+      // Sequence number of this mem node; the lowest mem operand in the DAG
+      // starts at zero.
+      unsigned SequenceNum;
+    };
+
+    /// This is a helper function for MergeConsecutiveStores. When the source
+    /// elements of the consecutive stores are all constants or all extracted
+    /// vector elements, try to merge them into one larger store.
+    /// \return True if a merged store was created.
+    bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,
+                                         EVT MemVT, unsigned NumElem,
+                                         bool IsConstantSrc, bool UseVector);
+
     /// Merge consecutive store operations into a wide store.
     /// This optimization uses wide integers or vectors when possible.
     /// \return True if some memory operations were changed.
@@ -378,12 +403,9 @@
     DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
         : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
           OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {
-      AttributeSet FnAttrs =
-          DAG.getMachineFunction().getFunction()->getAttributes();
-      ForCodeSize =
-          FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
-                               Attribute::OptimizeForSize) ||
-          FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+      auto *F = DAG.getMachineFunction().getFunction();
+      ForCodeSize = F->hasFnAttribute(Attribute::OptimizeForSize) ||
+                    F->hasFnAttribute(Attribute::MinSize);
     }
 
     /// Runs the dag combiner on all nodes in the work list
@@ -444,7 +466,7 @@
 }
 
 SDValue TargetLowering::DAGCombinerInfo::
-CombineTo(SDNode *N, const std::vector<SDValue> &To, bool AddTo) {
+CombineTo(SDNode *N, ArrayRef<SDValue> To, bool AddTo) {
   return ((DAGCombiner*)DC)->CombineTo(N, &To[0], To.size(), AddTo);
 }
 
@@ -736,10 +758,9 @@
     if (SDNode *L = isConstantBuildVectorOrConstantInt(N0.getOperand(1))) {
       if (SDNode *R = isConstantBuildVectorOrConstantInt(N1)) {
         // reassoc. (op (op x, c1), c2) -> (op x, (op c1, c2))
-        SDValue OpNode = DAG.FoldConstantArithmetic(Opc, VT, L, R);
-        if (!OpNode.getNode())
-          return SDValue();
-        return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode);
+        if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, VT, L, R))
+          return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode);
+        return SDValue();
       }
       if (N0.hasOneUse()) {
         // reassoc. (op (op x, c1), y) -> (op (op x, y), c1) iff x+c1 has one
@@ -757,10 +778,9 @@
     if (SDNode *R = isConstantBuildVectorOrConstantInt(N1.getOperand(1))) {
       if (SDNode *L = isConstantBuildVectorOrConstantInt(N0)) {
         // reassoc. (op c2, (op x, c1)) -> (op x, (op c1, c2))
-        SDValue OpNode = DAG.FoldConstantArithmetic(Opc, VT, R, L);
-        if (!OpNode.getNode())
-          return SDValue();
-        return DAG.getNode(Opc, DL, VT, N1.getOperand(0), OpNode);
+        if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, VT, R, L))
+          return DAG.getNode(Opc, DL, VT, N1.getOperand(0), OpNode);
+        return SDValue();
       }
       if (N1.hasOneUse()) {
         // reassoc. (op y, (op x, c1)) -> (op (op x, y), c1) iff x+c1 has one
@@ -785,11 +805,12 @@
         N->dump(&DAG);
         dbgs() << "\nWith: ";
         To[0].getNode()->dump(&DAG);
-        dbgs() << " and " << NumTo-1 << " other values\n";
-        for (unsigned i = 0, e = NumTo; i != e; ++i)
-          assert((!To[i].getNode() ||
-                  N->getValueType(i) == To[i].getValueType()) &&
-                 "Cannot combine value to value of different type!"));
+        dbgs() << " and " << NumTo-1 << " other values\n");
+  for (unsigned i = 0, e = NumTo; i != e; ++i)
+    assert((!To[i].getNode() ||
+            N->getValueType(i) == To[i].getValueType()) &&
+           "Cannot combine value to value of different type!");
+
   WorklistRemover DeadNodes(*this);
   DAG.ReplaceAllUsesWith(N, To);
   if (AddTo) {
@@ -874,8 +895,8 @@
   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op)) {
     EVT MemVT = LD->getMemoryVT();
     ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD)
-      ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT) ? ISD::ZEXTLOAD
-                                                  : ISD::EXTLOAD)
+      ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, PVT, MemVT) ? ISD::ZEXTLOAD
+                                                       : ISD::EXTLOAD)
       : LD->getExtensionType();
     Replace = true;
     return DAG.getExtLoad(ExtType, dl, PVT,
@@ -1096,8 +1117,8 @@
     LoadSDNode *LD = cast<LoadSDNode>(N);
     EVT MemVT = LD->getMemoryVT();
     ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD)
-      ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT) ? ISD::ZEXTLOAD
-                                                  : ISD::EXTLOAD)
+      ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, PVT, MemVT) ? ISD::ZEXTLOAD
+                                                       : ISD::EXTLOAD)
       : LD->getExtensionType();
     SDValue NewLD = DAG.getExtLoad(ExtType, dl, PVT,
                                    LD->getChain(), LD->getBasePtr(),
@@ -1160,10 +1181,8 @@
   LegalTypes = Level >= AfterLegalizeTypes;
 
   // Early exit if this basic block is in an optnone function.
-  AttributeSet FnAttrs =
-    DAG.getMachineFunction().getFunction()->getAttributes();
-  if (FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
-                           Attribute::OptimizeNone))
+  if (DAG.getMachineFunction().getFunction()->hasFnAttribute(
+          Attribute::OptimizeNone))
     return;
 
   // Add all the dag nodes to the worklist.
@@ -1351,6 +1370,8 @@
   case ISD::EXTRACT_SUBVECTOR:  return visitEXTRACT_SUBVECTOR(N);
   case ISD::VECTOR_SHUFFLE:     return visitVECTOR_SHUFFLE(N);
   case ISD::INSERT_SUBVECTOR:   return visitINSERT_SUBVECTOR(N);
+  case ISD::MLOAD:              return visitMLOAD(N);
+  case ISD::MSTORE:             return visitMSTORE(N);
   }
   return SDValue();
 }
@@ -1475,7 +1496,7 @@
       switch (Op.getOpcode()) {
       case ISD::EntryToken:
         // Entry tokens don't need to be added to the list. They are
-        // rededundant.
+        // redundant.
         Changed = true;
         break;
 
@@ -1504,7 +1525,7 @@
 
   SDValue Result;
 
-  // If we've change things around then replace token factor.
+  // If we've changed things around then replace token factor.
   if (Changed) {
     if (Ops.empty()) {
       // The entry token is the only possible outcome.
@@ -1514,8 +1535,11 @@
       Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops);
     }
 
-    // Don't add users to work list.
-    return CombineTo(N, Result, false);
+    // Add users to worklist if AA is enabled, since it may introduce
+    // a lot of new chained token factors while removing memory deps.
+    bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
+      : DAG.getSubtarget().useAA();
+    return CombineTo(N, Result, UseAA /*add to worklist*/);
   }
 
   return Result;
@@ -1541,8 +1565,6 @@
 SDValue DAGCombiner::visitADD(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N0.getValueType();
 
   // fold vector ops
@@ -1563,6 +1585,8 @@
   if (N1.getOpcode() == ISD::UNDEF)
     return N1;
   // fold (add c1, c2) -> c1+c2
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::ADD, VT, N0C, N1C);
   // canonicalize constant to RHS
@@ -1714,8 +1738,6 @@
 SDValue DAGCombiner::visitADDC(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N0.getValueType();
 
   // If the flag result is dead, turn this into an ADD.
@@ -1725,6 +1747,8 @@
                                  SDLoc(N), MVT::Glue));
 
   // canonicalize constant to RHS.
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (N0C && !N1C)
     return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N1, N0);
 
@@ -1756,10 +1780,10 @@
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   SDValue CarryIn = N->getOperand(2);
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
 
   // canonicalize constant to RHS
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (N0C && !N1C)
     return DAG.getNode(ISD::ADDE, SDLoc(N), N->getVTList(),
                        N1, N0, CarryIn);
@@ -1786,10 +1810,6 @@
 SDValue DAGCombiner::visitSUB(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0.getNode());
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
-  ConstantSDNode *N1C1 = N1.getOpcode() != ISD::ADD ? nullptr :
-    dyn_cast<ConstantSDNode>(N1.getOperand(1).getNode());
   EVT VT = N0.getValueType();
 
   // fold vector ops
@@ -1807,6 +1827,8 @@
   if (N0 == N1)
     return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations, LegalTypes);
   // fold (sub c1, c2) -> c1-c2
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0.getNode());
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::SUB, VT, N0C, N1C);
   // fold (sub x, c) -> (add x, -c)
@@ -1826,6 +1848,8 @@
   if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1)
     return N0.getOperand(0);
   // fold C2-(A+C1) -> (C2-C1)-A
+  ConstantSDNode *N1C1 = N1.getOpcode() != ISD::ADD ? nullptr :
+    dyn_cast<ConstantSDNode>(N1.getOperand(1).getNode());
   if (N1.getOpcode() == ISD::ADD && N0C && N1C1) {
     SDValue NewC = DAG.getConstant(N0C->getAPIntValue() - N1C1->getAPIntValue(),
                                    VT);
@@ -1890,8 +1914,6 @@
 SDValue DAGCombiner::visitSUBC(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N0.getValueType();
 
   // If the flag result is dead, turn this into an SUB.
@@ -1907,6 +1929,8 @@
                                  MVT::Glue));
 
   // fold (subc x, 0) -> x + no borrow
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (N1C && N1C->isNullValue())
     return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, SDLoc(N),
                                         MVT::Glue));
@@ -2055,8 +2079,6 @@
 SDValue DAGCombiner::visitSDIV(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantSDNode *N0C = isConstOrConstSplat(N0);
-  ConstantSDNode *N1C = isConstOrConstSplat(N1);
   EVT VT = N->getValueType(0);
 
   // fold vector ops
@@ -2066,6 +2088,8 @@
   }
 
   // fold (sdiv c1, c2) -> c1/c2
+  ConstantSDNode *N0C = isConstOrConstSplat(N0);
+  ConstantSDNode *N1C = isConstOrConstSplat(N1);
   if (N0C && N1C && !N1C->isNullValue())
     return DAG.FoldConstantArithmetic(ISD::SDIV, VT, N0C, N1C);
   // fold (sdiv X, 1) -> X
@@ -2145,8 +2169,6 @@
 SDValue DAGCombiner::visitUDIV(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantSDNode *N0C = isConstOrConstSplat(N0);
-  ConstantSDNode *N1C = isConstOrConstSplat(N1);
   EVT VT = N->getValueType(0);
 
   // fold vector ops
@@ -2156,6 +2178,8 @@
   }
 
   // fold (udiv c1, c2) -> c1/c2
+  ConstantSDNode *N0C = isConstOrConstSplat(N0);
+  ConstantSDNode *N1C = isConstOrConstSplat(N1);
   if (N0C && N1C && !N1C->isNullValue())
     return DAG.FoldConstantArithmetic(ISD::UDIV, VT, N0C, N1C);
   // fold (udiv x, (1 << c)) -> x >>u c
@@ -2197,11 +2221,11 @@
 SDValue DAGCombiner::visitSREM(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantSDNode *N0C = isConstOrConstSplat(N0);
-  ConstantSDNode *N1C = isConstOrConstSplat(N1);
   EVT VT = N->getValueType(0);
 
   // fold (srem c1, c2) -> c1%c2
+  ConstantSDNode *N0C = isConstOrConstSplat(N0);
+  ConstantSDNode *N1C = isConstOrConstSplat(N1);
   if (N0C && N1C && !N1C->isNullValue())
     return DAG.FoldConstantArithmetic(ISD::SREM, VT, N0C, N1C);
   // If we know the sign bits of both operands are zero, strength reduce to a
@@ -2239,11 +2263,11 @@
 SDValue DAGCombiner::visitUREM(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantSDNode *N0C = isConstOrConstSplat(N0);
-  ConstantSDNode *N1C = isConstOrConstSplat(N1);
   EVT VT = N->getValueType(0);
 
   // fold (urem c1, c2) -> c1%c2
+  ConstantSDNode *N0C = isConstOrConstSplat(N0);
+  ConstantSDNode *N1C = isConstOrConstSplat(N1);
   if (N0C && N1C && !N1C->isNullValue())
     return DAG.FoldConstantArithmetic(ISD::UREM, VT, N0C, N1C);
   // fold (urem x, pow2) -> (and x, pow2-1)
@@ -2522,6 +2546,7 @@
   // fold (OP (zext x), (zext y)) -> (zext (OP x, y))
   // fold (OP (sext x), (sext y)) -> (sext (OP x, y))
   // fold (OP (aext x), (aext y)) -> (aext (OP x, y))
+  // fold (OP (bswap x), (bswap y)) -> (bswap (OP x, y))
   // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y)) (if trunc isn't free)
   //
   // do not sink logical op inside of a vector extend, since it may combine
@@ -2529,6 +2554,7 @@
   EVT Op0VT = N0.getOperand(0).getValueType();
   if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
        N0.getOpcode() == ISD::SIGN_EXTEND ||
+       N0.getOpcode() == ISD::BSWAP ||
        // Avoid infinite looping with PromoteIntBinOp.
        (N0.getOpcode() == ISD::ANY_EXTEND &&
         (!LegalTypes || TLI.isTypeDesirableForOp(N->getOpcode(), Op0VT))) ||
@@ -2662,11 +2688,7 @@
 SDValue DAGCombiner::visitAND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  SDValue LL, LR, RL, RR, CC0, CC1;
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N1.getValueType();
-  unsigned BitWidth = VT.getScalarType().getSizeInBits();
 
   // fold vector ops
   if (VT.isVector()) {
@@ -2698,6 +2720,8 @@
   if (N0.getOpcode() == ISD::UNDEF || N1.getOpcode() == ISD::UNDEF)
     return DAG.getConstant(0, VT);
   // fold (and c1, c2) -> c1&c2
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::AND, VT, N0C, N1C);
   // canonicalize constant to RHS
@@ -2707,6 +2731,7 @@
   if (N1C && N1C->isAllOnesValue())
     return N0;
   // if (and x, c) is known to be zero, return 0
+  unsigned BitWidth = VT.getScalarType().getSizeInBits();
   if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
                                    APInt::getAllOnesValue(BitWidth)))
     return DAG.getConstant(0, VT);
@@ -2793,6 +2818,7 @@
     // actually legal and isn't going to get expanded, else this is a false
     // optimisation.
     bool CanZextLoadProfitably = TLI.isLoadExtLegal(ISD::ZEXTLOAD,
+                                                    Load->getValueType(0),
                                                     Load->getMemoryVT());
 
     // Resize the constant to the same size as the original memory access before
@@ -2838,6 +2864,7 @@
     }
   }
   // fold (and (setcc x), (setcc y)) -> (setcc (and x, y))
+  SDValue LL, LR, RL, RR, CC0, CC1;
   if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){
     ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get();
     ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get();
@@ -2919,7 +2946,7 @@
     if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth,
                            BitWidth - MemVT.getScalarType().getSizeInBits())) &&
         ((!LegalOperations && !LN0->isVolatile()) ||
-         TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT))) {
+         TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) {
       SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT,
                                        LN0->getChain(), LN0->getBasePtr(),
                                        MemVT, LN0->getMemOperand());
@@ -2939,7 +2966,7 @@
     if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth,
                            BitWidth - MemVT.getScalarType().getSizeInBits())) &&
         ((!LegalOperations && !LN0->isVolatile()) ||
-         TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT))) {
+         TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) {
       SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT,
                                        LN0->getChain(), LN0->getBasePtr(),
                                        MemVT, LN0->getMemOperand());
@@ -2965,10 +2992,11 @@
       if (ActiveBits > 0 && APIntOps::isMask(ActiveBits, N1C->getAPIntValue())){
         EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
         EVT LoadedVT = LN0->getMemoryVT();
+        EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT;
 
         if (ExtVT == LoadedVT &&
-            (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, ExtVT))) {
-          EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT;
+            (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy,
+                                                    ExtVT))) {
 
           SDValue NewLoad =
             DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy,
@@ -2983,7 +3011,8 @@
         // Do not generate loads of non-round integer types since these can
         // be expensive (and would be wrong if the type is not byte sized).
         if (!LN0->isVolatile() && LoadedVT.bitsGT(ExtVT) && ExtVT.isRound() &&
-            (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, ExtVT))) {
+            (!LegalOperations || TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy,
+                                                    ExtVT))) {
           EVT PtrType = LN0->getOperand(1).getValueType();
 
           unsigned Alignment = LN0->getAlignment();
@@ -3003,7 +3032,6 @@
 
           AddToWorklist(NewPtr.getNode());
 
-          EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT;
           SDValue Load =
             DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy,
                            LN0->getChain(), NewPtr,
@@ -3313,9 +3341,6 @@
 SDValue DAGCombiner::visitOR(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  SDValue LL, LR, RL, RR, CC0, CC1;
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N1.getValueType();
 
   // fold vector ops
@@ -3407,6 +3432,8 @@
     return DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), VT);
   }
   // fold (or c1, c2) -> c1|c2
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::OR, VT, N0C, N1C);
   // canonicalize constant to RHS
@@ -3440,15 +3467,15 @@
              isa<ConstantSDNode>(N0.getOperand(1))) {
     ConstantSDNode *C1 = cast<ConstantSDNode>(N0.getOperand(1));
     if ((C1->getAPIntValue() & N1C->getAPIntValue()) != 0) {
-      SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, VT, N1C, C1);
-      if (!COR.getNode())
-        return SDValue();
-      return DAG.getNode(ISD::AND, SDLoc(N), VT,
-                         DAG.getNode(ISD::OR, SDLoc(N0), VT,
-                                     N0.getOperand(0), N1), COR);
+      if (SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, VT, N1C, C1))
+        return DAG.getNode(
+            ISD::AND, SDLoc(N), VT,
+            DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1), COR);
+      return SDValue();
     }
   }
   // fold (or (setcc x), (setcc y)) -> (setcc (or x, y))
+  SDValue LL, LR, RL, RR, CC0, CC1;
   if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){
     ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get();
     ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get();
@@ -3521,6 +3548,17 @@
     }
   }
 
+  // (or (and X, M), (and X, N)) -> (and X, (or M, N))
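+  // e.g. (or (and x, 0xF0), (and x, 0x0F)) -> (and x, 0xFF)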
+  if (N0.getOpcode() == ISD::AND &&
+      N1.getOpcode() == ISD::AND &&
+      N0.getOperand(0) == N1.getOperand(0) &&
+      // Don't increase # computations.
+      (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) {
+    SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
+                            N0.getOperand(1), N1.getOperand(1));
+    return DAG.getNode(ISD::AND, SDLoc(N), VT, N0.getOperand(0), X);
+  }
+
   // See if this is some rotate idiom.
   if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N)))
     return SDValue(Rot, 0);
@@ -3790,9 +3828,6 @@
 SDValue DAGCombiner::visitXOR(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  SDValue LHS, RHS, CC;
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N0.getValueType();
 
   // fold vector ops
@@ -3816,6 +3851,8 @@
   if (N1.getOpcode() == ISD::UNDEF)
     return N1;
   // fold (xor c1, c2) -> c1^c2
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::XOR, VT, N0C, N1C);
   // canonicalize constant to RHS
@@ -3830,6 +3867,7 @@
     return RXOR;
 
   // fold !(x cc y) -> (x !cc y)
+  SDValue LHS, RHS, CC;
   if (TLI.isConstTrueVal(N1.getNode()) && isSetCCEquivalent(N0, LHS, RHS, CC)) {
     bool isInt = LHS.getValueType().isInteger();
     ISD::CondCode NotCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
@@ -4039,12 +4077,11 @@
 SDValue DAGCombiner::visitSHL(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N0.getValueType();
   unsigned OpSizeInBits = VT.getScalarSizeInBits();
 
   // fold vector ops
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (VT.isVector()) {
     SDValue FoldedVOp = SimplifyVBinOp(N);
     if (FoldedVOp.getNode()) return FoldedVOp;
@@ -4061,8 +4098,7 @@
         if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC &&
             TLI.getBooleanContents(N00.getOperand(0).getValueType()) ==
                 TargetLowering::ZeroOrNegativeOneBooleanContent) {
-          SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, VT, N01CV, N1CV);
-          if (C.getNode())
+          if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, VT, N01CV, N1CV))
             return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, C);
         }
       } else {
@@ -4072,6 +4108,7 @@
   }
 
   // fold (shl c1, c2) -> c1<<c2
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::SHL, VT, N0C, N1C);
   // fold (shl 0, x) -> 0
@@ -4220,12 +4257,11 @@
 SDValue DAGCombiner::visitSRA(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N0.getValueType();
   unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
 
   // fold vector ops
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (VT.isVector()) {
     SDValue FoldedVOp = SimplifyVBinOp(N);
     if (FoldedVOp.getNode()) return FoldedVOp;
@@ -4234,6 +4270,7 @@
   }
 
   // fold (sra c1, c2) -> (sra c1, c2)
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::SRA, VT, N0C, N1C);
   // fold (sra 0, x) -> 0
@@ -4366,12 +4403,11 @@
 SDValue DAGCombiner::visitSRL(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   EVT VT = N0.getValueType();
   unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
 
   // fold vector ops
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (VT.isVector()) {
     SDValue FoldedVOp = SimplifyVBinOp(N);
     if (FoldedVOp.getNode()) return FoldedVOp;
@@ -4380,6 +4416,7 @@
   }
 
   // fold (srl c1, c2) -> c1 >>u c2
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
   if (N0C && N1C)
     return DAG.FoldConstantArithmetic(ISD::SRL, VT, N0C, N1C);
   // fold (srl 0, x) -> 0
@@ -4608,13 +4645,47 @@
   return SDValue();
 }
 
+
+/// \brief Generate Min/Max node
+static SDValue combineMinNumMaxNum(SDLoc DL, EVT VT, SDValue LHS, SDValue RHS,
+                                   SDValue True, SDValue False,
+                                   ISD::CondCode CC, const TargetLowering &TLI,
+                                   SelectionDAG &DAG) {
+  if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
+    return SDValue();
+
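+  // The select operands match the compare operands, so an lt/le compare
+  // yields FMINNUM when the value selected on true is the compare's LHS and
+  // FMAXNUM when it is the RHS; the gt/ge cases mirror this.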
+  switch (CC) {
+  case ISD::SETOLT:
+  case ISD::SETOLE:
+  case ISD::SETLT:
+  case ISD::SETLE:
+  case ISD::SETULT:
+  case ISD::SETULE: {
+    unsigned Opcode = (LHS == True) ? ISD::FMINNUM : ISD::FMAXNUM;
+    if (TLI.isOperationLegal(Opcode, VT))
+      return DAG.getNode(Opcode, DL, VT, LHS, RHS);
+    return SDValue();
+  }
+  case ISD::SETOGT:
+  case ISD::SETOGE:
+  case ISD::SETGT:
+  case ISD::SETGE:
+  case ISD::SETUGT:
+  case ISD::SETUGE: {
+    unsigned Opcode = (LHS == True) ? ISD::FMAXNUM : ISD::FMINNUM;
+    if (TLI.isOperationLegal(Opcode, VT))
+      return DAG.getNode(Opcode, DL, VT, LHS, RHS);
+    return SDValue();
+  }
+  default:
+    return SDValue();
+  }
+}
+
 SDValue DAGCombiner::visitSELECT(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   SDValue N2 = N->getOperand(2);
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
-  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
-  ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
   EVT VT = N->getValueType(0);
   EVT VT0 = N0.getValueType();
 
@@ -4622,12 +4693,14 @@
   if (N1 == N2)
     return N1;
   // fold (select true, X, Y) -> X
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
   if (N0C && !N0C->isNullValue())
     return N1;
   // fold (select false, X, Y) -> Y
   if (N0C && N0C->isNullValue())
     return N2;
   // fold (select C, 1, X) -> (or C, X)
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (VT == MVT::i1 && N1C && N1C->getAPIntValue() == 1)
     return DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N2);
   // fold (select C, 0, 1) -> (xor C, 1)
@@ -4639,6 +4712,7 @@
   // undiscoverable (or not reasonably discoverable). For example, it could be
   // in another basic block or it could require searching a complicated
   // expression.
+  ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
   if (VT.isInteger() &&
       (VT0 == MVT::i1 || (VT0.isInteger() &&
                           TLI.getBooleanContents(false, false) ==
@@ -4687,6 +4761,28 @@
 
   // fold selects based on a setcc into other things, such as min/max/abs
   if (N0.getOpcode() == ISD::SETCC) {
+    // select (fcmp lt x, y), x, y -> fminnum x, y
+    // select (fcmp gt x, y), x, y -> fmaxnum x, y
+    //
+    // This is OK if we don't care about what happens if either operand is a
+    // NaN.
+    //
+
+    // FIXME: Instead of testing for UnsafeFPMath, this should be checking for
+    // no signed zeros as well as no nans.
+    const TargetOptions &Options = DAG.getTarget().Options;
+    if (Options.UnsafeFPMath &&
+        VT.isFloatingPoint() && N0.hasOneUse() &&
+        DAG.isKnownNeverNaN(N1) && DAG.isKnownNeverNaN(N2)) {
+      ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
+
+      SDValue FMinMax =
+          combineMinNumMaxNum(SDLoc(N), VT, N0.getOperand(0), N0.getOperand(1),
+                              N1, N2, CC, TLI, DAG);
+      if (FMinMax)
+        return FMinMax;
+    }
+
     if ((!LegalOperations &&
          TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT)) ||
         TLI.isOperationLegal(ISD::SELECT_CC, VT))
@@ -4771,6 +4867,166 @@
       TopHalf->isNullValue() ? RHS->getOperand(1) : LHS->getOperand(1));
 }
 
+SDValue DAGCombiner::visitMSTORE(SDNode *N) {
+
+  if (Level >= AfterLegalizeTypes)
+    return SDValue();
+
+  MaskedStoreSDNode *MST = dyn_cast<MaskedStoreSDNode>(N);
+  SDValue Mask = MST->getMask();
+  SDValue Data  = MST->getValue();
+  SDLoc DL(N);
+
+  // If the MSTORE data type requires splitting and the mask is provided by a
+  // SETCC, then split both nodes and their operands before legalization. This
+  // prevents the type legalizer from unrolling SETCC into scalar comparisons
+  // and enables future optimizations (e.g. min/max pattern matching on X86).
+  if (Mask.getOpcode() == ISD::SETCC) {
+
+    // Check if any splitting is required.
+    if (TLI.getTypeAction(*DAG.getContext(), Data.getValueType()) !=
+        TargetLowering::TypeSplitVector)
+      return SDValue();
+
+    SDValue MaskLo, MaskHi, Lo, Hi;
+    std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
+
+    EVT LoVT, HiVT;
+    std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MST->getValueType(0));
+
+    SDValue Chain = MST->getChain();
+    SDValue Ptr   = MST->getBasePtr();
+
+    EVT MemoryVT = MST->getMemoryVT();
+    unsigned Alignment = MST->getOriginalAlignment();
+
+    // If the alignment equals the full vector size, use half of it for the
+    // second half of the split store.
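+    // (e.g. a 64-byte-aligned 512-bit store splits into a 64-byte-aligned
+    // low half and a 32-byte-aligned high half).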
+    unsigned SecondHalfAlignment =
+      (Alignment == Data->getValueType(0).getSizeInBits()/8) ?
+         Alignment/2 : Alignment;
+
+    EVT LoMemVT, HiMemVT;
+    std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+    SDValue DataLo, DataHi;
+    std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
+
+    MachineMemOperand *MMO = DAG.getMachineFunction().
+      getMachineMemOperand(MST->getPointerInfo(),
+                           MachineMemOperand::MOStore,  LoMemVT.getStoreSize(),
+                           Alignment, MST->getAAInfo(), MST->getRanges());
+
+    Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO,
+                            MST->isTruncatingStore());
+
+    unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
+    Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+                      DAG.getConstant(IncrementSize, Ptr.getValueType()));
+
+    MMO = DAG.getMachineFunction().
+      getMachineMemOperand(MST->getPointerInfo(),
+                           MachineMemOperand::MOStore,  HiMemVT.getStoreSize(),
+                           SecondHalfAlignment, MST->getAAInfo(),
+                           MST->getRanges());
+
+    Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO,
+                            MST->isTruncatingStore());
+
+    AddToWorklist(Lo.getNode());
+    AddToWorklist(Hi.getNode());
+
+    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
+  }
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitMLOAD(SDNode *N) {
+
+  if (Level >= AfterLegalizeTypes)
+    return SDValue();
+
+  MaskedLoadSDNode *MLD = dyn_cast<MaskedLoadSDNode>(N);
+  SDValue Mask = MLD->getMask();
+  SDLoc DL(N);
+
+  // If the MLOAD result requires splitting and the mask is provided by a
+  // SETCC, then split both nodes and their operands before legalization. This
+  // prevents the type legalizer from unrolling SETCC into scalar comparisons
+  // and enables future optimizations (e.g. min/max pattern matching on X86).
+
+  if (Mask.getOpcode() == ISD::SETCC) {
+    EVT VT = N->getValueType(0);
+
+    // Check if any splitting is required.
+    if (TLI.getTypeAction(*DAG.getContext(), VT) !=
+        TargetLowering::TypeSplitVector)
+      return SDValue();
+
+    SDValue MaskLo, MaskHi, Lo, Hi;
+    std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
+
+    SDValue Src0 = MLD->getSrc0();
+    SDValue Src0Lo, Src0Hi;
+    std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, DL);
+
+    EVT LoVT, HiVT;
+    std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0));
+
+    SDValue Chain = MLD->getChain();
+    SDValue Ptr   = MLD->getBasePtr();
+    EVT MemoryVT = MLD->getMemoryVT();
+    unsigned Alignment = MLD->getOriginalAlignment();
+
+    // If the alignment equals the full vector size, use half of it for the
+    // second half of the split load.
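+    // (e.g. a 64-byte-aligned 512-bit load splits into a 64-byte-aligned
+    // low half and a 32-byte-aligned high half).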
+    unsigned SecondHalfAlignment =
+      (Alignment == MLD->getValueType(0).getSizeInBits()/8) ?
+         Alignment/2 : Alignment;
+
+    EVT LoMemVT, HiMemVT;
+    std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+    MachineMemOperand *MMO = DAG.getMachineFunction().
+    getMachineMemOperand(MLD->getPointerInfo(),
+                         MachineMemOperand::MOLoad,  LoMemVT.getStoreSize(),
+                         Alignment, MLD->getAAInfo(), MLD->getRanges());
+
+    Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, Src0Lo, LoMemVT, MMO,
+                           ISD::NON_EXTLOAD);
+
+    unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
+    Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+                      DAG.getConstant(IncrementSize, Ptr.getValueType()));
+
+    MMO = DAG.getMachineFunction().
+    getMachineMemOperand(MLD->getPointerInfo(),
+                         MachineMemOperand::MOLoad,  HiMemVT.getStoreSize(),
+                         SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges());
+
+    Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, Src0Hi, HiMemVT, MMO,
+                           ISD::NON_EXTLOAD);
+
+    AddToWorklist(Lo.getNode());
+    AddToWorklist(Hi.getNode());
+
+    // Build a factor node to remember that this load is independent of the
+    // other one.
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1),
+                        Hi.getValue(1));
+
+    // Legalized the chain result - switch anything that used the old chain to
+    // use the new one.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), Chain);
+
+    SDValue LoadRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+
+    SDValue RetOps[] = { LoadRes, Chain };
+    return DAG.getMergeValues(RetOps, DL);
+  }
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitVSELECT(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -4880,13 +5136,16 @@
         return N2;    // cond always true -> true val
       else
         return N3;    // cond always false -> false val
-    }
-
-    // Fold to a simpler select_cc
-    if (SCC.getOpcode() == ISD::SETCC)
+    } else if (SCC->getOpcode() == ISD::UNDEF) {
+      // When the condition is UNDEF, just return the first operand. This is
+      // consistent with DAG creation: no setcc node is created in this case.
+      return N2;
+    } else if (SCC.getOpcode() == ISD::SETCC) {
+      // Fold to a simpler select_cc
       return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N2.getValueType(),
                          SCC.getOperand(0), SCC.getOperand(1), N2, N3,
                          SCC.getOperand(2));
+    }
   }
 
   // If we can fold this based on the true/false value, do so.
@@ -5047,6 +5306,102 @@
   }
 }
 
+// FIXME: Bring more similar combines here, common to sext/zext (maybe aext?).
+SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  EVT DstVT = N->getValueType(0);
+  EVT SrcVT = N0.getValueType();
+
+  assert((N->getOpcode() == ISD::SIGN_EXTEND ||
+          N->getOpcode() == ISD::ZERO_EXTEND) &&
+         "Unexpected node type (not an extend)!");
+
+  // fold (sext (load x)) to multiple smaller sextloads; same for zext.
+  // For example, on a target with legal v4i32, but illegal v8i32, turn:
+  //   (v8i32 (sext (v8i16 (load x))))
+  // into:
+  //   (v8i32 (concat_vectors (v4i32 (sextload x)),
+  //                          (v4i32 (sextload (x + 16)))))
+  // Where uses of the original load, i.e.:
+  //   (v8i16 (load x))
+  // are replaced with:
+  //   (v8i16 (truncate
+  //     (v8i32 (concat_vectors (v4i32 (sextload x)),
+  //                            (v4i32 (sextload (x + 16)))))))
+  //
+  // This combine is only applicable to illegal, but splittable, vectors.
+  // All legal types, and illegal non-vector types, are handled elsewhere.
+  // This combine is controlled by TargetLowering::isVectorLoadExtDesirable.
+  //
+  if (N0->getOpcode() != ISD::LOAD)
+    return SDValue();
+
+  LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+
+  if (!ISD::isNON_EXTLoad(LN0) || !ISD::isUNINDEXEDLoad(LN0) ||
+      !N0.hasOneUse() || LN0->isVolatile() || !DstVT.isVector() ||
+      !DstVT.isPow2VectorType() || !TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
+    return SDValue();
+
+  SmallVector<SDNode *, 4> SetCCs;
+  if (!ExtendUsesToFormExtLoad(N, N0, N->getOpcode(), SetCCs, TLI))
+    return SDValue();
+
+  ISD::LoadExtType ExtType =
+      N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+
+  // Try to split the vector types to get down to legal types.
+  EVT SplitSrcVT = SrcVT;
+  EVT SplitDstVT = DstVT;
+  while (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT) &&
+         SplitSrcVT.getVectorNumElements() > 1) {
+    SplitDstVT = DAG.GetSplitDestVTs(SplitDstVT).first;
+    SplitSrcVT = DAG.GetSplitDestVTs(SplitSrcVT).first;
+  }
+
+  if (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT))
+    return SDValue();
+
+  SDLoc DL(N);
+  const unsigned NumSplits =
+      DstVT.getVectorNumElements() / SplitDstVT.getVectorNumElements();
+  const unsigned Stride = SplitSrcVT.getStoreSize();
+  SmallVector<SDValue, 4> Loads;
+  SmallVector<SDValue, 4> Chains;
+
+  SDValue BasePtr = LN0->getBasePtr();
+  for (unsigned Idx = 0; Idx < NumSplits; Idx++) {
+    const unsigned Offset = Idx * Stride;
+    const unsigned Align = MinAlign(LN0->getAlignment(), Offset);
+
+    SDValue SplitLoad = DAG.getExtLoad(
+        ExtType, DL, SplitDstVT, LN0->getChain(), BasePtr,
+        LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT,
+        LN0->isVolatile(), LN0->isNonTemporal(), LN0->isInvariant(),
+        Align, LN0->getAAInfo());
+
+    BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
+                          DAG.getConstant(Stride, BasePtr.getValueType()));
+
+    Loads.push_back(SplitLoad.getValue(0));
+    Chains.push_back(SplitLoad.getValue(1));
+  }
+
+  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+  SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads);
+
+  CombineTo(N, NewValue);
+
+  // Replace uses of the original load (before extension)
+  // with a truncate of the concatenated sextloaded vectors.
+  SDValue Trunc =
+      DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), NewValue);
+  CombineTo(N0.getNode(), Trunc, NewChain);
+  ExtendSetCCUses(SetCCs, Trunc, NewValue, DL,
+                  (ISD::NodeType)N->getOpcode());
+  return SDValue(N, 0); // Return N so it doesn't get rechecked!
+}
+
 SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
@@ -5113,17 +5468,18 @@
   }
 
   // fold (sext (load x)) -> (sext (truncate (sextload x)))
-  // None of the supported targets knows how to perform load and sign extend
-  // on vectors in one instruction.  We only perform this transformation on
-  // scalars.
-  if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
-      ISD::isUNINDEXEDLoad(N0.getNode()) &&
-      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
-       TLI.isLoadExtLegal(ISD::SEXTLOAD, N0.getValueType()))) {
+  // Only generate vector extloads when 1) they're legal, and 2) they are
+  // deemed desirable by the target.
+  if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+      ((!LegalOperations && !VT.isVector() &&
+        !cast<LoadSDNode>(N0)->isVolatile()) ||
+       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, N0.getValueType()))) {
     bool DoXform = true;
     SmallVector<SDNode*, 4> SetCCs;
     if (!N0.hasOneUse())
       DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::SIGN_EXTEND, SetCCs, TLI);
+    if (VT.isVector())
+      DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
     if (DoXform) {
       LoadSDNode *LN0 = cast<LoadSDNode>(N0);
       SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
@@ -5140,6 +5496,11 @@
     }
   }
 
+  // fold (sext (load x)) to multiple smaller sextloads.
+  // Only on illegal but splittable vectors.
+  if (SDValue ExtLoad = CombineExtLoad(N))
+    return ExtLoad;
+
   // fold (sext (sextload x)) -> (sext (truncate (sextload x)))
   // fold (sext ( extload x)) -> (sext (truncate (sextload x)))
   if ((ISD::isSEXTLoad(N0.getNode()) || ISD::isEXTLoad(N0.getNode())) &&
@@ -5147,7 +5508,7 @@
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     EVT MemVT = LN0->getMemoryVT();
     if ((!LegalOperations && !LN0->isVolatile()) ||
-        TLI.isLoadExtLegal(ISD::SEXTLOAD, MemVT)) {
+        TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT)) {
       SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                        LN0->getChain(),
                                        LN0->getBasePtr(), MemVT,
@@ -5167,7 +5528,7 @@
        N0.getOpcode() == ISD::XOR) &&
       isa<LoadSDNode>(N0.getOperand(0)) &&
       N0.getOperand(1).getOpcode() == ISD::Constant &&
-      TLI.isLoadExtLegal(ISD::SEXTLOAD, N0.getValueType()) &&
+      TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, N0.getValueType()) &&
       (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0.getOperand(0));
     if (LN0->getExtensionType() != ISD::ZEXTLOAD && LN0->isUnindexed()) {
@@ -5403,17 +5764,18 @@
   }
 
   // fold (zext (load x)) -> (zext (truncate (zextload x)))
-  // None of the supported targets knows how to perform load and vector_zext
-  // on vectors in one instruction.  We only perform this transformation on
-  // scalars.
-  if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
-      ISD::isUNINDEXEDLoad(N0.getNode()) &&
-      ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
-       TLI.isLoadExtLegal(ISD::ZEXTLOAD, N0.getValueType()))) {
+  // Only generate vector extloads when 1) they're legal, and 2) they are
+  // deemed desirable by the target.
+  if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+      ((!LegalOperations && !VT.isVector() &&
+        !cast<LoadSDNode>(N0)->isVolatile()) ||
+       TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, N0.getValueType()))) {
     bool DoXform = true;
     SmallVector<SDNode*, 4> SetCCs;
     if (!N0.hasOneUse())
       DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ZERO_EXTEND, SetCCs, TLI);
+    if (VT.isVector())
+      DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
     if (DoXform) {
       LoadSDNode *LN0 = cast<LoadSDNode>(N0);
       SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT,
@@ -5431,13 +5793,18 @@
     }
   }
 
+  // fold (zext (load x)) to multiple smaller zextloads.
+  // Only on illegal but splittable vectors.
+  if (SDValue ExtLoad = CombineExtLoad(N))
+    return ExtLoad;
+
   // fold (zext (and/or/xor (load x), cst)) ->
   //      (and/or/xor (zextload x), (zext cst))
   if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
        N0.getOpcode() == ISD::XOR) &&
       isa<LoadSDNode>(N0.getOperand(0)) &&
       N0.getOperand(1).getOpcode() == ISD::Constant &&
-      TLI.isLoadExtLegal(ISD::ZEXTLOAD, N0.getValueType()) &&
+      TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, N0.getValueType()) &&
       (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0.getOperand(0));
     if (LN0->getExtensionType() != ISD::SEXTLOAD && LN0->isUnindexed()) {
@@ -5474,7 +5841,7 @@
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     EVT MemVT = LN0->getMemoryVT();
     if ((!LegalOperations && !LN0->isVolatile()) ||
-        TLI.isLoadExtLegal(ISD::ZEXTLOAD, MemVT)) {
+        TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT)) {
       SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT,
                                        LN0->getChain(),
                                        LN0->getBasePtr(), MemVT,
@@ -5636,7 +6003,7 @@
   // scalars.
   if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
       ISD::isUNINDEXEDLoad(N0.getNode()) &&
-      TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType())) {
+      TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
     bool DoXform = true;
     SmallVector<SDNode*, 4> SetCCs;
     if (!N0.hasOneUse())
@@ -5666,7 +6033,7 @@
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     ISD::LoadExtType ExtType = LN0->getExtensionType();
     EVT MemVT = LN0->getMemoryVT();
-    if (!LegalOperations || TLI.isLoadExtLegal(ExtType, MemVT)) {
+    if (!LegalOperations || TLI.isLoadExtLegal(ExtType, VT, MemVT)) {
       SDValue ExtLoad = DAG.getExtLoad(ExtType, SDLoc(N),
                                        VT, LN0->getChain(), LN0->getBasePtr(),
                                        MemVT, LN0->getMemOperand());
@@ -5795,7 +6162,7 @@
     ExtVT = EVT::getIntegerVT(*DAG.getContext(),
                               VT.getSizeInBits() - N01->getZExtValue());
   }
-  if (LegalOperations && !TLI.isLoadExtLegal(ExtType, ExtVT))
+  if (LegalOperations && !TLI.isLoadExtLegal(ExtType, VT, ExtVT))
     return SDValue();
 
   unsigned EVTBits = ExtVT.getSizeInBits();
@@ -5874,6 +6241,9 @@
       LN0->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits() + ShAmt)
     return SDValue();
 
+  if (!TLI.shouldReduceLoadWidth(LN0, ExtType, ExtVT))
+    return SDValue();
+
   EVT PtrType = N0.getOperand(1).getValueType();
 
   if (PtrType == MVT::Untyped || PtrType.isExtended())
@@ -5999,7 +6369,7 @@
       ISD::isUNINDEXEDLoad(N0.getNode()) &&
       EVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
       ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
-       TLI.isLoadExtLegal(ISD::SEXTLOAD, EVT))) {
+       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                      LN0->getChain(),
@@ -6015,7 +6385,7 @@
       N0.hasOneUse() &&
       EVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
       ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
-       TLI.isLoadExtLegal(ISD::SEXTLOAD, EVT))) {
+       TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
                                      LN0->getChain(),
@@ -6318,19 +6688,15 @@
 
   // If the input is a constant, let getNode fold it.
   if (isa<ConstantSDNode>(N0) || isa<ConstantFPSDNode>(N0)) {
-    SDValue Res = DAG.getNode(ISD::BITCAST, SDLoc(N), VT, N0);
-    if (Res.getNode() != N) {
-      if (!LegalOperations ||
-          TLI.isOperationLegal(Res.getNode()->getOpcode(), VT))
-        return Res;
-
-      // Folding it resulted in an illegal node, and it's too late to
-      // do that. Clean up the old node and forego the transformation.
-      // Ideally this won't happen very often, because instcombine
-      // and the earlier dagcombine runs (where illegal nodes are
-      // permitted) should have folded most of them already.
-      deleteAndRecombine(Res.getNode());
-    }
+    // If we can't allow illegal operations, we need to check that this is just
+    // an fp -> int or int -> fp conversion and that the resulting operation
+    // will be legal.
+    if (!LegalOperations ||
+        (isa<ConstantSDNode>(N0) && VT.isFloatingPoint() && !VT.isVector() &&
+         TLI.isOperationLegal(ISD::ConstantFP, VT)) ||
+        (isa<ConstantFPSDNode>(N0) && VT.isInteger() && !VT.isVector() &&
+         TLI.isOperationLegal(ISD::Constant, VT)))
+      return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, N0);
   }
 
   // (conv (conv x, t1), t2) -> (conv x, t2)
@@ -6489,7 +6855,6 @@
   if (SrcEltVT.isFloatingPoint()) {
     // Convert the input float vector to a int vector where the elements are the
     // same sizes.
-    assert((SrcEltVT == MVT::f32 || SrcEltVT == MVT::f64) && "Unknown FP VT!");
     EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltVT.getSizeInBits());
     BV = ConstantFoldBITCASTofBUILD_VECTOR(BV, IntVT).getNode();
     SrcEltVT = IntVT;
@@ -6498,7 +6863,6 @@
   // Now we know the input is an integer vector.  If the output is a FP type,
   // convert to integer first, then to FP of the right size.
   if (DstEltVT.isFloatingPoint()) {
-    assert((DstEltVT == MVT::f32 || DstEltVT == MVT::f64) && "Unknown FP VT!");
     EVT TmpVT = EVT::getIntegerVT(*DAG.getContext(), DstEltVT.getSizeInBits());
     SDNode *Tmp = ConstantFoldBITCASTofBUILD_VECTOR(BV, TmpVT).getNode();
 
@@ -6549,8 +6913,7 @@
 
   for (unsigned i = 0, e = BV->getNumOperands(); i != e; ++i) {
     if (BV->getOperand(i).getOpcode() == ISD::UNDEF) {
-      for (unsigned j = 0; j != NumOutputsPerInput; ++j)
-        Ops.push_back(DAG.getUNDEF(DstEltVT));
+      Ops.append(NumOutputsPerInput, DAG.getUNDEF(DstEltVT));
       continue;
     }
 
@@ -6575,6 +6938,133 @@
   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(BV), VT, Ops);
 }
 
+// Attempt different variants of (fadd (fmul a, b), c) -> fma or fmad
+static SDValue performFaddFmulCombines(unsigned FusedOpcode,
+                                       bool Aggressive,
+                                       SDNode *N,
+                                       const TargetLowering &TLI,
+                                       SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
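+  // FusedOpcode is ISD::FMA or ISD::FMAD (chosen by the caller); Aggressive
+  // drops the one-use restrictions and enables the nested folds below.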
+
+  // fold (fadd (fmul x, y), z) -> (fma x, y, z)
+  if (N0.getOpcode() == ISD::FMUL &&
+      (Aggressive || N0->hasOneUse())) {
+    return DAG.getNode(FusedOpcode, SDLoc(N), VT,
+                       N0.getOperand(0), N0.getOperand(1), N1);
+  }
+
+  // fold (fadd x, (fmul y, z)) -> (fma y, z, x)
+  // Note: Commutes FADD operands.
+  if (N1.getOpcode() == ISD::FMUL &&
+      (Aggressive || N1->hasOneUse())) {
+    return DAG.getNode(FusedOpcode, SDLoc(N), VT,
+                       N1.getOperand(0), N1.getOperand(1), N0);
+  }
+
+  // More folding opportunities when target permits.
+  if (Aggressive) {
+    // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, z))
+    if (N0.getOpcode() == ISD::FMA &&
+        N0.getOperand(2).getOpcode() == ISD::FMUL) {
+      return DAG.getNode(FusedOpcode, SDLoc(N), VT,
+                         N0.getOperand(0), N0.getOperand(1),
+                         DAG.getNode(FusedOpcode, SDLoc(N), VT,
+                                     N0.getOperand(2).getOperand(0),
+                                     N0.getOperand(2).getOperand(1),
+                                     N1));
+    }
+
+    // fold (fadd x, (fma y, z, (fmul u, v))) -> (fma y, z, (fma u, v, x))
+    if (N1->getOpcode() == ISD::FMA &&
+        N1.getOperand(2).getOpcode() == ISD::FMUL) {
+      return DAG.getNode(FusedOpcode, SDLoc(N), VT,
+                         N1.getOperand(0), N1.getOperand(1),
+                         DAG.getNode(FusedOpcode, SDLoc(N), VT,
+                                     N1.getOperand(2).getOperand(0),
+                                     N1.getOperand(2).getOperand(1),
+                                     N0));
+    }
+  }
+
+  return SDValue();
+}
+
+static SDValue performFsubFmulCombines(unsigned FusedOpcode,
+                                       bool Aggressive,
+                                       SDNode *N,
+                                       const TargetLowering &TLI,
+                                       SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+
+  SDLoc SL(N);
+
+  // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+  if (N0.getOpcode() == ISD::FMUL &&
+      (Aggressive || N0->hasOneUse())) {
+    return DAG.getNode(FusedOpcode, SL, VT,
+                       N0.getOperand(0), N0.getOperand(1),
+                       DAG.getNode(ISD::FNEG, SL, VT, N1));
+  }
+
+  // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+  // Note: Commutes FSUB operands.
+  if (N1.getOpcode() == ISD::FMUL &&
+      (Aggressive || N1->hasOneUse()))
+    return DAG.getNode(FusedOpcode, SL, VT,
+                       DAG.getNode(ISD::FNEG, SL, VT,
+                                   N1.getOperand(0)),
+                       N1.getOperand(1), N0);
+
+  // fold (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+  if (N0.getOpcode() == ISD::FNEG &&
+      N0.getOperand(0).getOpcode() == ISD::FMUL &&
+      (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) {
+    SDValue N00 = N0.getOperand(0).getOperand(0);
+    SDValue N01 = N0.getOperand(0).getOperand(1);
+    return DAG.getNode(FusedOpcode, SL, VT,
+                       DAG.getNode(ISD::FNEG, SL, VT, N00), N01,
+                       DAG.getNode(ISD::FNEG, SL, VT, N1));
+  }
+
+  // More folding opportunities when target permits.
+  if (Aggressive) {
+    // fold (fsub (fma x, y, (fmul u, v)), z)
+    //   -> (fma x, y, (fma u, v, (fneg z)))
+    if (N0.getOpcode() == FusedOpcode &&
+        N0.getOperand(2).getOpcode() == ISD::FMUL) {
+      return DAG.getNode(FusedOpcode, SDLoc(N), VT,
+                         N0.getOperand(0), N0.getOperand(1),
+                         DAG.getNode(FusedOpcode, SDLoc(N), VT,
+                                     N0.getOperand(2).getOperand(0),
+                                     N0.getOperand(2).getOperand(1),
+                                     DAG.getNode(ISD::FNEG, SDLoc(N), VT,
+                                                 N1)));
+    }
+
+    // fold (fsub x, (fma y, z, (fmul u, v)))
+    //   -> (fma (fneg y), z, (fma (fneg u), v, x))
+    if (N1.getOpcode() == FusedOpcode &&
+        N1.getOperand(2).getOpcode() == ISD::FMUL) {
+      SDValue N20 = N1.getOperand(2).getOperand(0);
+      SDValue N21 = N1.getOperand(2).getOperand(1);
+      return DAG.getNode(FusedOpcode, SDLoc(N), VT,
+                         DAG.getNode(ISD::FNEG, SDLoc(N), VT,
+                                     N1.getOperand(0)),
+                         N1.getOperand(1),
+                         DAG.getNode(FusedOpcode, SDLoc(N), VT,
+                                     DAG.getNode(ISD::FNEG, SDLoc(N),  VT,
+                                                 N20),
+                                     N21, N0));
+    }
+  }
+
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitFADD(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -6714,23 +7204,55 @@
     }
   } // enable-unsafe-fp-math
 
+  if (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)) {
+    // Assume if there is an fmad instruction that it should be aggressively
+    // used.
+    if (SDValue Fused = performFaddFmulCombines(ISD::FMAD, true, N, TLI, DAG))
+      return Fused;
+  }
+
   // FADD -> FMA combines:
   if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) &&
       TLI.isFMAFasterThanFMulAndFAdd(VT) &&
       (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT))) {
 
-    // fold (fadd (fmul x, y), z) -> (fma x, y, z)
-    if (N0.getOpcode() == ISD::FMUL &&
-        (N0->hasOneUse() || TLI.enableAggressiveFMAFusion(VT)))
-      return DAG.getNode(ISD::FMA, SDLoc(N), VT,
-                         N0.getOperand(0), N0.getOperand(1), N1);
+    if (!TLI.isOperationLegal(ISD::FMAD, VT)) {
+      // Don't form FMA if we are preferring FMAD.
+      if (SDValue Fused
+          = performFaddFmulCombines(ISD::FMA,
+                                    TLI.enableAggressiveFMAFusion(VT),
+                                    N, TLI, DAG)) {
+        return Fused;
+      }
+    }
 
-    // fold (fadd x, (fmul y, z)) -> (fma y, z, x)
-    // Note: Commutes FADD operands.
-    if (N1.getOpcode() == ISD::FMUL &&
-        (N1->hasOneUse() || TLI.enableAggressiveFMAFusion(VT)))
-      return DAG.getNode(ISD::FMA, SDLoc(N), VT,
-                         N1.getOperand(0), N1.getOperand(1), N0);
+    // When FP_EXTEND nodes are free on the target, and there is an opportunity
+    // to combine into FMA, arrange such nodes accordingly.
+    if (TLI.isFPExtFree(VT)) {
+
+      // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
+      if (N0.getOpcode() == ISD::FP_EXTEND) {
+        SDValue N00 = N0.getOperand(0);
+        if (N00.getOpcode() == ISD::FMUL)
+          return DAG.getNode(ISD::FMA, SDLoc(N), VT,
+                             DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT,
+                                         N00.getOperand(0)),
+                             DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT,
+                                         N00.getOperand(1)), N1);
+      }
+
+      // fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x)
+      // Note: Commutes FADD operands.
+      if (N1.getOpcode() == ISD::FP_EXTEND) {
+        SDValue N10 = N1.getOperand(0);
+        if (N10.getOpcode() == ISD::FMUL)
+          return DAG.getNode(ISD::FMA, SDLoc(N), VT,
+                             DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT,
+                                         N10.getOperand(0)),
+                             DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT,
+                                         N10.getOperand(1)), N0);
+      }
+    }
   }
 
   return SDValue();
@@ -6792,37 +7314,95 @@
     }
   }
 
+  if (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT)) {
+    // Assume if there is an fmad instruction that it should be aggressively
+    // used.
+    if (SDValue Fused = performFsubFmulCombines(ISD::FMAD, true, N, TLI, DAG))
+      return Fused;
+  }
+
   // FSUB -> FMA combines:
   if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath) &&
       TLI.isFMAFasterThanFMulAndFAdd(VT) &&
       (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT))) {
 
-    // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
-    if (N0.getOpcode() == ISD::FMUL &&
-        (N0->hasOneUse() || TLI.enableAggressiveFMAFusion(VT)))
-      return DAG.getNode(ISD::FMA, dl, VT,
-                         N0.getOperand(0), N0.getOperand(1),
-                         DAG.getNode(ISD::FNEG, dl, VT, N1));
+    if (!TLI.isOperationLegal(ISD::FMAD, VT)) {
+      // Don't form FMA if we are preferring FMAD.
 
-    // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
-    // Note: Commutes FSUB operands.
-    if (N1.getOpcode() == ISD::FMUL &&
-        (N1->hasOneUse() || TLI.enableAggressiveFMAFusion(VT)))
-      return DAG.getNode(ISD::FMA, dl, VT,
-                         DAG.getNode(ISD::FNEG, dl, VT,
-                         N1.getOperand(0)),
-                         N1.getOperand(1), N0);
+      if (SDValue Fused
+          = performFsubFmulCombines(ISD::FMA,
+                                    TLI.enableAggressiveFMAFusion(VT),
+                                    N, TLI, DAG)) {
+        return Fused;
+      }
+    }
 
-    // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z))
-    if (N0.getOpcode() == ISD::FNEG &&
-        N0.getOperand(0).getOpcode() == ISD::FMUL &&
-        ((N0->hasOneUse() && N0.getOperand(0).hasOneUse()) ||
-            TLI.enableAggressiveFMAFusion(VT))) {
-      SDValue N00 = N0.getOperand(0).getOperand(0);
-      SDValue N01 = N0.getOperand(0).getOperand(1);
-      return DAG.getNode(ISD::FMA, dl, VT,
-                         DAG.getNode(ISD::FNEG, dl, VT, N00), N01,
-                         DAG.getNode(ISD::FNEG, dl, VT, N1));
+    // When FP_EXTEND nodes are free on the target, and there is an opportunity
+    // to combine into FMA, arrange such nodes accordingly.
+    if (TLI.isFPExtFree(VT)) {
+      // fold (fsub (fpext (fmul x, y)), z)
+      //   -> (fma (fpext x), (fpext y), (fneg z))
+      if (N0.getOpcode() == ISD::FP_EXTEND) {
+        SDValue N00 = N0.getOperand(0);
+        if (N00.getOpcode() == ISD::FMUL)
+          return DAG.getNode(ISD::FMA, SDLoc(N), VT,
+                             DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT,
+                                         N00.getOperand(0)),
+                             DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT,
+                                         N00.getOperand(1)),
+                             DAG.getNode(ISD::FNEG, SDLoc(N), VT, N1));
+      }
+
+      // fold (fsub x, (fpext (fmul y, z)))
+      //   -> (fma (fneg (fpext y)), (fpext z), x)
+      // Note: Commutes FSUB operands.
+      if (N1.getOpcode() == ISD::FP_EXTEND) {
+        SDValue N10 = N1.getOperand(0);
+        if (N10.getOpcode() == ISD::FMUL)
+          return DAG.getNode(ISD::FMA, SDLoc(N), VT,
+                             DAG.getNode(ISD::FNEG, SDLoc(N), VT,
+                                         DAG.getNode(ISD::FP_EXTEND, SDLoc(N),
+                                                     VT, N10.getOperand(0))),
+                             DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT,
+                                         N10.getOperand(1)),
+                             N0);
+      }
+
+      // fold (fsub (fpext (fneg (fmul x, y))), z)
+      //   -> (fma (fneg (fpext x)), (fpext y), (fneg z))
+      if (N0.getOpcode() == ISD::FP_EXTEND) {
+        SDValue N00 = N0.getOperand(0);
+        if (N00.getOpcode() == ISD::FNEG) {
+          SDValue N000 = N00.getOperand(0);
+          if (N000.getOpcode() == ISD::FMUL) {
+            return DAG.getNode(ISD::FMA, dl, VT,
+                               DAG.getNode(ISD::FNEG, dl, VT,
+                                           DAG.getNode(ISD::FP_EXTEND, SDLoc(N),
+                                                       VT, N000.getOperand(0))),
+                               DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT,
+                                           N000.getOperand(1)),
+                               DAG.getNode(ISD::FNEG, dl, VT, N1));
+          }
+        }
+      }
+
+      // fold (fsub (fneg (fpext (fmul x, y))), z)
+      //   -> (fma (fneg (fpext x)), (fpext y), (fneg z))
+      if (N0.getOpcode() == ISD::FNEG) {
+        SDValue N00 = N0.getOperand(0);
+        if (N00.getOpcode() == ISD::FP_EXTEND) {
+          SDValue N000 = N00.getOperand(0);
+          if (N000.getOpcode() == ISD::FMUL) {
+            return DAG.getNode(ISD::FMA, dl, VT,
+                               DAG.getNode(ISD::FNEG, dl, VT,
+                                           DAG.getNode(ISD::FP_EXTEND, SDLoc(N),
+                                           VT, N000.getOperand(0))),
+                               DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT,
+                                           N000.getOperand(1)),
+                               DAG.getNode(ISD::FNEG, dl, VT, N1));
+          }
+        }
+      }
     }
   }
 
@@ -7104,6 +7684,44 @@
     }
   }
 
+  // Combine multiple FDIVs with the same divisor into multiple FMULs by the
+  // reciprocal.
+  // E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip)
+  // Notice that this is not always beneficial. One reason is that different
+  // targets may have different costs for FDIV and FMUL, so sometimes the cost
+  // of two FDIVs may be lower than the cost of one FDIV and two FMULs. Another
+  // reason is that the critical path is increased from "one FDIV" to
+  // "one FDIV + one FMUL".
+  if (Options.UnsafeFPMath) {
+    // Skip if current node is a reciprocal.
+    if (N0CFP && N0CFP->isExactlyValue(1.0))
+      return SDValue();
+
+    SmallVector<SDNode *, 4> Users;
+    // Find all FDIV users of the same divisor.
+    for (SDNode::use_iterator UI = N1.getNode()->use_begin(),
+                              UE = N1.getNode()->use_end();
+         UI != UE; ++UI) {
+      SDNode *User = UI.getUse().getUser();
+      if (User->getOpcode() == ISD::FDIV && User->getOperand(1) == N1)
+        Users.push_back(User);
+    }
+
+    if (TLI.combineRepeatedFPDivisors(Users.size())) {
+      SDValue FPOne = DAG.getConstantFP(1.0, VT); // floating point 1.0
+      SDValue Reciprocal = DAG.getNode(ISD::FDIV, SDLoc(N), VT, FPOne, N1);
+
+      // Dividend / Divisor -> Dividend * Reciprocal
+      for (auto I = Users.begin(), E = Users.end(); I != E; ++I) {
+        if ((*I)->getOperand(0) != FPOne) {
+          SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(*I), VT,
+                                        (*I)->getOperand(0), Reciprocal);
+          DAG.ReplaceAllUsesWith(*I, NewNode.getNode());
+        }
+      }
+      return SDValue();
+    }
+  }
+
   return SDValue();
 }
 
@@ -7122,7 +7740,8 @@
 }
 
 SDValue DAGCombiner::visitFSQRT(SDNode *N) {
-  if (DAG.getTarget().Options.UnsafeFPMath) {
+  if (DAG.getTarget().Options.UnsafeFPMath &&
+      !TLI.isFsqrtCheap()) {
     // Compute this as X * (1/sqrt(X)) = X * (X ** -0.5)
     if (SDValue RV = BuildRsqrtEstimate(N->getOperand(0))) {
       EVT VT = RV.getValueType();
@@ -7198,11 +7817,11 @@
 
 SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
   SDValue N0 = N->getOperand(0);
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
   EVT VT = N->getValueType(0);
   EVT OpVT = N0.getValueType();
 
   // fold (sint_to_fp c1) -> c1fp
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
   if (N0C &&
       // ...but only if the target supports immediate floating-point values
       (!LegalOperations ||
@@ -7251,11 +7870,11 @@
 
 SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
   SDValue N0 = N->getOperand(0);
-  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
   EVT VT = N->getValueType(0);
   EVT OpVT = N0.getValueType();
 
   // fold (uint_to_fp c1) -> c1fp
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
   if (N0C &&
       // ...but only if the target supports immediate floating-point values
       (!LegalOperations ||
@@ -7289,6 +7908,50 @@
   return SDValue();
 }
 
+// Fold (fp_to_{s/u}int ({s/u}int_to_fp x)) -> zext x, sext x, trunc x, or x
+static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  if (N0.getOpcode() != ISD::UINT_TO_FP && N0.getOpcode() != ISD::SINT_TO_FP)
+    return SDValue();
+
+  SDValue Src = N0.getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  bool IsInputSigned = N0.getOpcode() == ISD::SINT_TO_FP;
+  bool IsOutputSigned = N->getOpcode() == ISD::FP_TO_SINT;
+
+  // We can safely assume the conversion won't overflow the output range,
+  // because (for example) (uint8_t)18293.f is undefined behavior.
+
+  // Since we can assume the conversion won't overflow, our decision as to
+  // whether the input will fit in the float should depend on the minimum
+  // of the input range and output range.
+
+  // This means this is also safe for a signed input and unsigned output, since
+  // a negative input would lead to undefined behavior.
+  unsigned InputSize = (int)SrcVT.getScalarSizeInBits() - IsInputSigned;
+  unsigned OutputSize = (int)VT.getScalarSizeInBits() - IsOutputSigned;
+  unsigned ActualSize = std::min(InputSize, OutputSize);
+  const fltSemantics &sem = DAG.EVTToAPFloatSemantics(N0.getValueType());
+
+  // We can only fold away the float conversion if the input range can be
+  // represented exactly in the float range.
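+  // e.g. i16 -> f32 -> i32 folds to a sign_extend, because every i16 value
+  // is exactly representable in f32's 24-bit significand, while
+  // i32 -> f32 -> i32 is not folded.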
+  if (APFloat::semanticsPrecision(sem) >= ActualSize) {
+    if (VT.getScalarSizeInBits() > SrcVT.getScalarSizeInBits()) {
+      unsigned ExtOp = IsInputSigned && IsOutputSigned ? ISD::SIGN_EXTEND
+                                                       : ISD::ZERO_EXTEND;
+      return DAG.getNode(ExtOp, SDLoc(N), VT, Src);
+    }
+    if (VT.getScalarSizeInBits() < SrcVT.getScalarSizeInBits())
+      return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Src);
+    if (SrcVT == VT)
+      return Src;
+    return DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Src);
+  }
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
@@ -7298,7 +7961,7 @@
   if (N0CFP)
     return DAG.getNode(ISD::FP_TO_SINT, SDLoc(N), VT, N0);
 
-  return SDValue();
+  return FoldIntToFPToInt(N, DAG);
 }
 
 SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) {
@@ -7310,7 +7973,7 @@
   if (N0CFP)
     return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), VT, N0);
 
-  return SDValue();
+  return FoldIntToFPToInt(N, DAG);
 }
 
 SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
@@ -7329,11 +7992,16 @@
 
   // fold (fp_round (fp_round x)) -> (fp_round x)
   if (N0.getOpcode() == ISD::FP_ROUND) {
-    // This is a value preserving truncation if both round's are.
-    bool IsTrunc = N->getConstantOperandVal(1) == 1 &&
-                   N0.getNode()->getConstantOperandVal(1) == 1;
-    return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, N0.getOperand(0),
-                       DAG.getIntPtrConstant(IsTrunc));
+    const bool NIsTrunc = N->getConstantOperandVal(1) == 1;
+    const bool N0IsTrunc = N0.getNode()->getConstantOperandVal(1) == 1;
+    // If the first fp_round isn't a value preserving truncation, it might
+    // introduce a tie in the second fp_round that wouldn't occur in the
+    // single-step fp_round we want to fold to.
+    // In other words, double rounding isn't the same as rounding.
+    // Also, this is a value preserving truncation iff both fp_round's are.
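+    // (e.g. rounding f64 -> f32 -> f16 can produce a different result than
+    // rounding f64 -> f16 directly).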
+    if (DAG.getTarget().Options.UnsafeFPMath || N0IsTrunc)
+      return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, N0.getOperand(0),
+                         DAG.getIntPtrConstant(NIsTrunc && N0IsTrunc));
   }
 
   // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y)
@@ -7391,7 +8059,7 @@
 
   // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
   if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
-       TLI.isLoadExtLegal(ISD::EXTLOAD, N0.getValueType())) {
+       TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
     LoadSDNode *LN0 = cast<LoadSDNode>(N0);
     SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
                                      LN0->getChain(),
@@ -8923,7 +9591,7 @@
   if (NotMaskLZ == 64) return Result;  // All zero mask.
 
   // See if we have a continuous run of bits.  If so, we have 0*1+0*
-  if (CountTrailingOnes_64(NotMask >> NotMaskTZ)+NotMaskTZ+NotMaskLZ != 64)
+  if (countTrailingOnes(NotMask >> NotMaskTZ) + NotMaskTZ + NotMaskLZ != 64)
     return Result;
 
   // Adjust NotMaskLZ down to be from the actual size of the int instead of i64.
@@ -9070,9 +9738,12 @@
     unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1;
     unsigned NewBW = NextPowerOf2(MSB - ShAmt);
     EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
+    // The narrowing should be profitable, the load/store operation should be
+    // legal (or custom) and the store size should be equal to the NewVT width.
     while (NewBW < BitWidth &&
-           !(TLI.isOperationLegalOrCustom(Opc, NewVT) &&
-             TLI.isNarrowingProfitable(VT, NewVT))) {
+           (NewVT.getStoreSizeInBits() != NewBW ||
+            !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
+            !TLI.isNarrowingProfitable(VT, NewVT))) {
       NewBW = NextPowerOf2(NewBW);
       NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
     }
@@ -9272,36 +9943,139 @@
   }
 };
 
-/// Holds a pointer to an LSBaseSDNode as well as information on where it
-/// is located in a sequence of memory operations connected by a chain.
-struct MemOpLink {
-  MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq):
-    MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { }
-  // Ptr to the mem node.
-  LSBaseSDNode *MemNode;
-  // Offset from the base ptr.
-  int64_t OffsetFromBase;
-  // What is the sequence number of this mem node.
-  // Lowest mem operand in the DAG starts at zero.
-  unsigned SequenceNum;
-};
+bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
+                  SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT,
+                  unsigned NumElem, bool IsConstantSrc, bool UseVector) {
+  // Make sure we have something to merge.
+  if (NumElem < 2)
+    return false;
+
+  int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8;
+  LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+  unsigned EarliestNodeUsed = 0;
+
+  for (unsigned i=0; i < NumElem; ++i) {
+    // Find a chain for the new wide-store operand. Notice that some
+    // of the store nodes that we found may not be selected for inclusion
+    // in the wide store. The chain we use needs to be the chain of the
+    // earliest store node which is *used* and replaced by the wide store.
+    if (StoreNodes[i].SequenceNum > StoreNodes[EarliestNodeUsed].SequenceNum)
+      EarliestNodeUsed = i;
+  }
+
+  // The earliest Node in the DAG.
+  LSBaseSDNode *EarliestOp = StoreNodes[EarliestNodeUsed].MemNode;
+  SDLoc DL(StoreNodes[0].MemNode);
+
+  SDValue StoredVal;
+  if (UseVector) {
+    // Find a legal type for the vector store.
+    EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem);
+    assert(TLI.isTypeLegal(Ty) && "Illegal vector store");
+    if (IsConstantSrc) {
+      // A vector store with a constant source implies that the constant is
+      // zero; we only handle merging stores of constant zeros because the zero
+      // can be materialized without a load.
+      // It may be beneficial to loosen this restriction to allow non-zero
+      // store merging.
+      StoredVal = DAG.getConstant(0, Ty);
+    } else {
+      SmallVector<SDValue, 8> Ops;
+      for (unsigned i = 0; i < NumElem ; ++i) {
+        StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+        SDValue Val = St->getValue();
+        // All of the operands of a BUILD_VECTOR must have the same type.
+        if (Val.getValueType() != MemVT)
+          return false;
+        Ops.push_back(Val);
+      }
+
+      // Build the extracted vector elements back into a vector.
+      StoredVal = DAG.getNode(ISD::BUILD_VECTOR, DL, Ty, Ops);
+    }
+  } else {
+    // We should always use a vector store when merging extracted vector
+    // elements, so this path implies a store of constants.
+    assert(IsConstantSrc && "Merged vector elements should use vector store");
+
+    unsigned StoreBW = NumElem * ElementSizeBytes * 8;
+    APInt StoreInt(StoreBW, 0);
+
+    // Construct a single integer constant which is made of the smaller
+    // constant inputs.
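+    // On a little-endian target the element at the highest address supplies
+    // the most significant bits of the merged value, so it is shifted in
+    // first (hence the reversed index below).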
+    bool IsLE = TLI.isLittleEndian();
+    for (unsigned i = 0; i < NumElem ; ++i) {
+      unsigned Idx = IsLE ? (NumElem - 1 - i) : i;
+      StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
+      SDValue Val = St->getValue();
+      StoreInt <<= ElementSizeBytes*8;
+      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
+        StoreInt |= C->getAPIntValue().zext(StoreBW);
+      } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
+        StoreInt |= C->getValueAPF().bitcastToAPInt().zext(StoreBW);
+      } else {
+        llvm_unreachable("Invalid constant element type");
+      }
+    }
+
+    // Create the new Load and Store operations.
+    EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
+    StoredVal = DAG.getConstant(StoreInt, StoreTy);
+  }
+
+  SDValue NewStore = DAG.getStore(EarliestOp->getChain(), DL, StoredVal,
+                                  FirstInChain->getBasePtr(),
+                                  FirstInChain->getPointerInfo(),
+                                  false, false,
+                                  FirstInChain->getAlignment());
+
+  // Replace the first store with the new store
+  CombineTo(EarliestOp, NewStore);
+  // Erase all other stores.
+  for (unsigned i = 0; i < NumElem ; ++i) {
+    if (StoreNodes[i].MemNode == EarliestOp)
+      continue;
+    StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+    // ReplaceAllUsesWith will replace all uses that existed when it was
+    // called, but graph optimizations may cause new ones to appear. For
+    // example, the case in pr14333 looks like
+    //
+    //  St's chain -> St -> another store -> X
+    //
+    // And the only difference from St to the other store is the chain.
+    // When we change its chain to be St's chain they become identical,
+    // get CSEed and the net result is that X is now a use of St.
+    // Since we know that St is redundant, just iterate.
+    while (!St->use_empty())
+      DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain());
+    deleteAndRecombine(St);
+  }
+
+  return true;
+}
 
 bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
+  if (OptLevel == CodeGenOpt::None)
+    return false;
+
   EVT MemVT = St->getMemoryVT();
   int64_t ElementSizeBytes = MemVT.getSizeInBits()/8;
-  bool NoVectors = DAG.getMachineFunction().getFunction()->getAttributes().
-    hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
+  bool NoVectors = DAG.getMachineFunction().getFunction()->hasFnAttribute(
+      Attribute::NoImplicitFloat);
 
   // Don't merge vectors into wider inputs.
   if (MemVT.isVector() || !MemVT.isSimple())
     return false;
 
   // Perform an early exit check. Do not bother looking at stored values that
-  // are not constants or loads.
+  // are not constants, loads, or extracted vector elements.
   SDValue StoredVal = St->getValue();
   bool IsLoadSrc = isa<LoadSDNode>(StoredVal);
-  if (!isa<ConstantSDNode>(StoredVal) && !isa<ConstantFPSDNode>(StoredVal) &&
-      !IsLoadSrc)
+  bool IsConstantSrc = isa<ConstantSDNode>(StoredVal) ||
+                       isa<ConstantFPSDNode>(StoredVal);
+  bool IsExtractVecEltSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT);
+
+  if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecEltSrc)
     return false;
 
   // Only look at ends of store sequences.
@@ -9443,7 +10217,7 @@
   LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
 
   // Store the constants into memory as one consecutive store.
-  if (!IsLoadSrc) {
+  if (IsConstantSrc) {
     unsigned LastLegalType = 0;
     unsigned LastLegalVectorType = 0;
     bool NonZero = false;
@@ -9492,85 +10266,33 @@
     bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
     unsigned NumElem = UseVector ? LastLegalVectorType : LastLegalType;
 
-    // Make sure we have something to merge.
-    if (NumElem < 2)
-      return false;
+    return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
+                                           true, UseVector);
+  }
 
-    unsigned EarliestNodeUsed = 0;
-    for (unsigned i=0; i < NumElem; ++i) {
-      // Find a chain for the new wide-store operand. Notice that some
-      // of the store nodes that we found may not be selected for inclusion
-      // in the wide store. The chain we use needs to be the chain of the
-      // earliest store node which is *used* and replaced by the wide store.
-      if (StoreNodes[i].SequenceNum > StoreNodes[EarliestNodeUsed].SequenceNum)
-        EarliestNodeUsed = i;
-    }
+  // When extracting multiple vector elements, try to store them
+  // in one vector store rather than a sequence of scalar stores.
+  if (IsExtractVecEltSrc) {
+    unsigned NumElem = 0;
+    for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) {
+      StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[i].MemNode);
+      SDValue StoredVal = St->getValue();
+      // Bail out if any stored values are not elements extracted from a
+      // vector. This restriction could be loosened.
+      // It should be possible to handle mixed sources, but load sources need
+      // more careful handling (see the block of code below that handles
+      // consecutive loads).
+      if (StoredVal.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+        return false;
 
-    // The earliest Node in the DAG.
-    LSBaseSDNode *EarliestOp = StoreNodes[EarliestNodeUsed].MemNode;
-    SDLoc DL(StoreNodes[0].MemNode);
-
-    SDValue StoredVal;
-    if (UseVector) {
       // Find a legal type for the vector store.
-      EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, NumElem);
-      assert(TLI.isTypeLegal(Ty) && "Illegal vector store");
-      StoredVal = DAG.getConstant(0, Ty);
-    } else {
-      unsigned StoreBW = NumElem * ElementSizeBytes * 8;
-      APInt StoreInt(StoreBW, 0);
-
-      // Construct a single integer constant which is made of the smaller
-      // constant inputs.
-      bool IsLE = TLI.isLittleEndian();
-      for (unsigned i = 0; i < NumElem ; ++i) {
-        unsigned Idx = IsLE ?(NumElem - 1 - i) : i;
-        StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
-        SDValue Val = St->getValue();
-        StoreInt<<=ElementSizeBytes*8;
-        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
-          StoreInt|=C->getAPIntValue().zext(StoreBW);
-        } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
-          StoreInt|= C->getValueAPF().bitcastToAPInt().zext(StoreBW);
-        } else {
-          assert(false && "Invalid constant element type");
-        }
-      }
-
-      // Create the new Load and Store operations.
-      EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), StoreBW);
-      StoredVal = DAG.getConstant(StoreInt, StoreTy);
+      EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT, i+1);
+      if (TLI.isTypeLegal(Ty))
+        NumElem = i + 1;
     }
 
-    SDValue NewStore = DAG.getStore(EarliestOp->getChain(), DL, StoredVal,
-                                    FirstInChain->getBasePtr(),
-                                    FirstInChain->getPointerInfo(),
-                                    false, false,
-                                    FirstInChain->getAlignment());
-
-    // Replace the first store with the new store
-    CombineTo(EarliestOp, NewStore);
-    // Erase all other stores.
-    for (unsigned i = 0; i < NumElem ; ++i) {
-      if (StoreNodes[i].MemNode == EarliestOp)
-        continue;
-      StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
-      // ReplaceAllUsesWith will replace all uses that existed when it was
-      // called, but graph optimizations may cause new ones to appear. For
-      // example, the case in pr14333 looks like
-      //
-      //  St's chain -> St -> another store -> X
-      //
-      // And the only difference from St to the other store is the chain.
-      // When we change it's chain to be St's chain they become identical,
-      // get CSEed and the net result is that X is now a use of St.
-      // Since we know that St is redundant, just iterate.
-      while (!St->use_empty())
-        DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain());
-      deleteAndRecombine(St);
-    }
-
-    return true;
+    return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
+                                           false, true);
   }
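
Note: the extract-element path above only determines how many leading stores in
the chain still form a legal vector type; the rewrite itself is done by
MergeStoresOfConstantsOrVecElts. Schematically (the types and offsets are made
up for illustration), the combine turns

    store (extract_vector_elt v4i32 %v, 0), %p
    store (extract_vector_elt v4i32 %v, 1), %p+4
    store (extract_vector_elt v4i32 %v, 2), %p+8
    store (extract_vector_elt v4i32 %v, 3), %p+12

into a single wide store of a BUILD_VECTOR of the same four extracted values:

    store (BUILD_VECTOR (extract %v,0), (extract %v,1),
                        (extract %v,2), (extract %v,3)), %p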
 
   // Below we handle the case of multiple consecutive stores that
@@ -9668,9 +10390,9 @@
       EVT LegalizedStoredValueTy =
         TLI.getTypeToTransformTo(*DAG.getContext(), StoreTy);
       if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
-          TLI.isLoadExtLegal(ISD::ZEXTLOAD, StoreTy) &&
-          TLI.isLoadExtLegal(ISD::SEXTLOAD, StoreTy) &&
-          TLI.isLoadExtLegal(ISD::EXTLOAD, StoreTy))
+          TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy, StoreTy) &&
+          TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy, StoreTy) &&
+          TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy))
         LastLegalIntegerType = i+1;
     }
   }
@@ -10108,7 +10830,8 @@
   if (ResultVT.bitsGT(VecEltVT)) {
     // If the result type of vextract is wider than the load, then issue an
     // extending load instead.
-    ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, VecEltVT)
+    ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, ResultVT,
+                                                  VecEltVT)
                                    ? ISD::ZEXTLOAD
                                    : ISD::EXTLOAD;
     Load = DAG.getExtLoad(
@@ -10474,6 +11197,11 @@
   if (!TLI.isOperationLegalOrCustom(Opcode, NVT))
     return SDValue();
 
+  // Just because the floating-point vector type is legal does not necessarily
+  // mean that the corresponding integer vector type is.
+  if (!isTypeLegal(NVT))
+    return SDValue();
+
   SmallVector<SDValue, 8> Opnds;
   for (unsigned i = 0; i != NumInScalars; ++i) {
     SDValue In = N->getOperand(i);
@@ -10519,26 +11247,37 @@
     return SDValue();
 
   SDValue VecIn1, VecIn2;
+  bool UsesZeroVector = false;
   for (unsigned i = 0; i != NumInScalars; ++i) {
+    SDValue Op = N->getOperand(i);
     // Ignore undef inputs.
-    if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
+    if (Op.getOpcode() == ISD::UNDEF) continue;
+
+    // See if we can combine this build_vector into a blend with a zero vector.
+    if (!VecIn2.getNode() && ((Op.getOpcode() == ISD::Constant &&
+        cast<ConstantSDNode>(Op.getNode())->isNullValue()) ||
+        (Op.getOpcode() == ISD::ConstantFP &&
+        cast<ConstantFPSDNode>(Op.getNode())->getValueAPF().isZero()))) {
+      UsesZeroVector = true;
+      continue;
+    }
 
     // If this input is something other than an EXTRACT_VECTOR_ELT with a
     // constant index, bail out.
-    if (N->getOperand(i).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
-        !isa<ConstantSDNode>(N->getOperand(i).getOperand(1))) {
+    if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        !isa<ConstantSDNode>(Op.getOperand(1))) {
       VecIn1 = VecIn2 = SDValue(nullptr, 0);
       break;
     }
 
     // We allow up to two distinct input vectors.
-    SDValue ExtractedFromVec = N->getOperand(i).getOperand(0);
+    SDValue ExtractedFromVec = Op.getOperand(0);
     if (ExtractedFromVec == VecIn1 || ExtractedFromVec == VecIn2)
       continue;
 
     if (!VecIn1.getNode()) {
       VecIn1 = ExtractedFromVec;
-    } else if (!VecIn2.getNode()) {
+    } else if (!VecIn2.getNode() && !UsesZeroVector) {
       VecIn2 = ExtractedFromVec;
     } else {
       // Too many inputs.
@@ -10549,55 +11288,93 @@
 
   // If everything is good, we can make a shuffle operation.
   if (VecIn1.getNode()) {
+    unsigned InNumElements = VecIn1.getValueType().getVectorNumElements();
     SmallVector<int, 8> Mask;
     for (unsigned i = 0; i != NumInScalars; ++i) {
-      if (N->getOperand(i).getOpcode() == ISD::UNDEF) {
+      unsigned Opcode = N->getOperand(i).getOpcode();
+      if (Opcode == ISD::UNDEF) {
         Mask.push_back(-1);
         continue;
       }
 
+      // Operands can also be zero.
+      if (Opcode != ISD::EXTRACT_VECTOR_ELT) {
+        assert(UsesZeroVector &&
+               (Opcode == ISD::Constant || Opcode == ISD::ConstantFP) &&
+               "Unexpected node found!");
+        Mask.push_back(NumInScalars+i);
+        continue;
+      }
+
       // If extracting from the first vector, just use the index directly.
       SDValue Extract = N->getOperand(i);
       SDValue ExtVal = Extract.getOperand(1);
+      unsigned ExtIndex = cast<ConstantSDNode>(ExtVal)->getZExtValue();
       if (Extract.getOperand(0) == VecIn1) {
-        unsigned ExtIndex = cast<ConstantSDNode>(ExtVal)->getZExtValue();
-        if (ExtIndex > VT.getVectorNumElements())
-          return SDValue();
-
         Mask.push_back(ExtIndex);
         continue;
       }
 
-      // Otherwise, use InIdx + VecSize
-      unsigned Idx = cast<ConstantSDNode>(ExtVal)->getZExtValue();
-      Mask.push_back(Idx+NumInScalars);
+      // Otherwise, use InIdx + InputVecSize
+      Mask.push_back(InNumElements + ExtIndex);
     }
 
+    // Avoid introducing illegal shuffles with zero.
+    if (UsesZeroVector && !TLI.isVectorClearMaskLegal(Mask, VT))
+      return SDValue();
+
     // We can't generate a shuffle node with mismatched input and output types.
     // Attempt to transform a single input vector to the correct type.
     if ((VT != VecIn1.getValueType())) {
-      // We don't support shuffeling between TWO values of different types.
-      if (VecIn2.getNode())
-        return SDValue();
-
-      // We only support widening of vectors which are half the size of the
-      // output registers. For example XMM->YMM widening on X86 with AVX.
-      if (VecIn1.getValueType().getSizeInBits()*2 != VT.getSizeInBits())
-        return SDValue();
-
       // If the input vector type has a different base type to the output
       // vector type, bail out.
-      if (VecIn1.getValueType().getVectorElementType() !=
-          VT.getVectorElementType())
+      EVT VTElemType = VT.getVectorElementType();
+      if ((VecIn1.getValueType().getVectorElementType() != VTElemType) ||
+          (VecIn2.getNode() &&
+           (VecIn2.getValueType().getVectorElementType() != VTElemType)))
         return SDValue();
 
-      // Widen the input vector by adding undef values.
-      VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
-                           VecIn1, DAG.getUNDEF(VecIn1.getValueType()));
+      // If the input vector is too small, widen it.
+      // We only support widening of vectors which are half the size of the
+      // output registers. For example XMM->YMM widening on X86 with AVX.
+      EVT VecInT = VecIn1.getValueType();
+      if (VecInT.getSizeInBits() * 2 == VT.getSizeInBits()) {
+        // If we only have one small input, widen it by adding undef values.
+        if (!VecIn2.getNode())
+          VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, VecIn1,
+                               DAG.getUNDEF(VecIn1.getValueType()));
+        else if (VecIn1.getValueType() == VecIn2.getValueType()) {
+          // If we have two small inputs of the same type, try to concat them.
+          VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, VecIn1, VecIn2);
+          VecIn2 = SDValue(nullptr, 0);
+        } else
+          return SDValue();
+      } else if (VecInT.getSizeInBits() == VT.getSizeInBits() * 2) {
+        // If the input vector is too large, try to split it.
+        // We don't support having two input vectors that are too large.
+        if (VecIn2.getNode())
+          return SDValue();
+
+        if (!TLI.isExtractSubvectorCheap(VT, VT.getVectorNumElements()))
+          return SDValue();
+
+        // Try to replace VecIn1 with two extract_subvectors.
+        // No need to update the masks; they should still be correct.
+        VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1,
+          DAG.getConstant(VT.getVectorNumElements(), TLI.getVectorIdxTy()));
+        VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1,
+          DAG.getConstant(0, TLI.getVectorIdxTy()));
+        UsesZeroVector = false;
+      } else
+        return SDValue();
     }
 
-    // If VecIn2 is unused then change it to undef.
-    VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
+    if (UsesZeroVector)
+      VecIn2 = VT.isInteger() ? DAG.getConstant(0, VT) :
+                                DAG.getConstantFP(0.0, VT);
+    else
+      // If VecIn2 is unused then change it to undef.
+      VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
 
     // Check that we were able to transform all incoming values to the same
     // type.
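
Note on the zero-vector handling above: a BUILD_VECTOR whose inputs are a mix
of elements extracted from one vector and zero constants can now become a
single shuffle whose second operand is a zero vector; zero lanes are encoded in
the mask as indices into that second operand. A standalone sketch (plain C++
emulating shufflevector semantics; the helper, lane values and types are made
up for illustration):

    #include <array>
    #include <cstdio>

    // Mask indices 0..N-1 select from A, N..2N-1 select from B.
    template <std::size_t N>
    std::array<int, N> applyShuffle(const std::array<int, N> &A,
                                    const std::array<int, N> &B,
                                    const std::array<int, N> &Mask) {
      std::array<int, N> R{};
      for (std::size_t i = 0; i != N; ++i)
        R[i] = Mask[i] < (int)N ? A[Mask[i]] : B[Mask[i] - (int)N];
      return R;
    }

    int main() {
      std::array<int, 4> A = {10, 11, 12, 13};  // VecIn1
      std::array<int, 4> Zero = {};             // the implicit zero vector
      // build_vector (extract A,0), 0, (extract A,2), 0
      //   -> vector_shuffle A, zero, <0, 5, 2, 7>
      for (int V : applyShuffle(A, Zero, {0, 5, 2, 7}))
        std::printf("%d ", V);                  // prints: 10 0 12 0
      std::printf("\n");
    }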
@@ -10656,36 +11433,56 @@
     }
   }
 
+  // Fold any combination of BUILD_VECTOR or UNDEF nodes into one BUILD_VECTOR.
+  // We have already tested above for an UNDEF only concatenation.
   // fold (concat_vectors (BUILD_VECTOR A, B, ...), (BUILD_VECTOR C, D, ...))
   // -> (BUILD_VECTOR A, B, ..., C, D, ...)
-  if (N->getNumOperands() == 2 &&
-      N->getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
-      N->getOperand(1).getOpcode() == ISD::BUILD_VECTOR) {
-    EVT VT = N->getValueType(0);
-    SDValue N0 = N->getOperand(0);
-    SDValue N1 = N->getOperand(1);
+  auto IsBuildVectorOrUndef = [](const SDValue &Op) {
+    return ISD::UNDEF == Op.getOpcode() || ISD::BUILD_VECTOR == Op.getOpcode();
+  };
+  bool AllBuildVectorsOrUndefs =
+      std::all_of(N->op_begin(), N->op_end(), IsBuildVectorOrUndef);
+  if (AllBuildVectorsOrUndefs) {
     SmallVector<SDValue, 8> Opnds;
-    unsigned BuildVecNumElts =  N0.getNumOperands();
+    EVT SVT = VT.getScalarType();
 
-    EVT SclTy0 = N0.getOperand(0)->getValueType(0);
-    EVT SclTy1 = N1.getOperand(0)->getValueType(0);
-    if (SclTy0.isFloatingPoint()) {
-      for (unsigned i = 0; i != BuildVecNumElts; ++i)
-        Opnds.push_back(N0.getOperand(i));
-      for (unsigned i = 0; i != BuildVecNumElts; ++i)
-        Opnds.push_back(N1.getOperand(i));
-    } else {
+    EVT MinVT = SVT;
+    if (!SVT.isFloatingPoint()) {
       // If the BUILD_VECTORs are built from integers, they may have different
-      // operand types. Get the smaller type and truncate all operands to it.
-      EVT MinTy = SclTy0.bitsLE(SclTy1) ? SclTy0 : SclTy1;
-      for (unsigned i = 0; i != BuildVecNumElts; ++i)
-        Opnds.push_back(DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinTy,
-                        N0.getOperand(i)));
-      for (unsigned i = 0; i != BuildVecNumElts; ++i)
-        Opnds.push_back(DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinTy,
-                        N1.getOperand(i)));
+      // operand types. Get the smallest type and truncate all operands to it.
+      bool FoundMinVT = false;
+      for (const SDValue &Op : N->ops())
+        if (ISD::BUILD_VECTOR == Op.getOpcode()) {
+          EVT OpSVT = Op.getOperand(0)->getValueType(0);
+          MinVT = (!FoundMinVT || OpSVT.bitsLE(MinVT)) ? OpSVT : MinVT;
+          FoundMinVT = true;
+        }
+      assert(FoundMinVT && "Concat vector type mismatch");
     }
 
+    for (const SDValue &Op : N->ops()) {
+      EVT OpVT = Op.getValueType();
+      unsigned NumElts = OpVT.getVectorNumElements();
+
+      if (ISD::UNDEF == Op.getOpcode())
+        for (unsigned i = 0; i != NumElts; ++i)
+          Opnds.push_back(DAG.getUNDEF(MinVT));
+
+      if (ISD::BUILD_VECTOR == Op.getOpcode()) {
+        if (SVT.isFloatingPoint()) {
+          assert(SVT == OpVT.getScalarType() && "Concat vector type mismatch");
+          for (unsigned i = 0; i != NumElts; ++i)
+            Opnds.push_back(Op.getOperand(i));
+        } else {
+          for (unsigned i = 0; i != NumElts; ++i)
+            Opnds.push_back(
+                DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinVT, Op.getOperand(i)));
+        }
+      }
+    }
+
+    assert(VT.getVectorNumElements() == Opnds.size() &&
+           "Concat vector type mismatch");
     return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), VT, Opnds);
   }
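
Note: the generalized fold above accepts any mix of BUILD_VECTOR and UNDEF
operands, and for integer elements it truncates every operand to the smallest
scalar operand type it finds. A worked instance in the same notation as the
comments (the types are made up for illustration):

    concat_vectors (BUILD_VECTOR:v2i16 A:i32, B:i32), undef:v2i16,
                   (BUILD_VECTOR:v2i16 C:i16, D:i16)
      -> BUILD_VECTOR:v6i16 (trunc A to i16), (trunc B to i16),
                            undef:i16, undef:i16, C, D

Here MinVT is i16 (the smallest BUILD_VECTOR operand type present), the UNDEF
operand contributes one undef per element, and only the i32 operands need a
real TRUNCATE; the i16 ones pass through unchanged.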
 
@@ -10881,7 +11678,8 @@
   return DAG.getVectorShuffle(VT, SDLoc(SVN), S0, S1, SVN->getMask());
 }
 
-// Tries to turn a shuffle of two CONCAT_VECTORS into a single concat.
+// Tries to turn a shuffle of two CONCAT_VECTORS into a single concat,
+// or to turn a shuffle of a single concat into a simpler shuffle followed by
+// a concat.
 static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
   unsigned NumElts = VT.getVectorNumElements();
@@ -10895,6 +11693,18 @@
   unsigned NumElemsPerConcat = ConcatVT.getVectorNumElements();
   unsigned NumConcats = NumElts / NumElemsPerConcat;
 
+  // Special case: shuffle(concat(A,B)) can be more efficiently represented
+  // as concat(shuffle(A,B),UNDEF) if the high half of the shuffle mask is
+  // entirely undef.
+  if (NumElemsPerConcat * 2 == NumElts && N1.getOpcode() == ISD::UNDEF &&
+      std::all_of(SVN->getMask().begin() + NumElemsPerConcat,
+                  SVN->getMask().end(), [](int i) { return i == -1; })) {
+    N0 = DAG.getVectorShuffle(ConcatVT, SDLoc(N), N0.getOperand(0),
+                              N0.getOperand(1),
+                              ArrayRef<int>(SVN->getMask().begin(),
+                                            NumElemsPerConcat));
+    N1 = DAG.getUNDEF(ConcatVT);
+    return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0, N1);
+  }
+
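
A worked instance of the special case above (the types are made up): with
N0 = concat_vectors A:v4i32, B:v4i32 and N1 = undef,

    vector_shuffle (concat_vectors A, B), undef, <1, 5, 2, 6, -1, -1, -1, -1>
      -> concat_vectors (vector_shuffle A, B, <1, 5, 2, 6>), undef:v4i32

The high half of the mask is entirely undef, so only the low-half mask is kept
and the shuffle is re-issued at ConcatVT.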
   // Look at every vector that's inserted. We're looking for exact
   // subvector-sized copies from a concatenated vector
   for (unsigned I = 0; I != NumConcats; ++I) {
@@ -10993,7 +11803,7 @@
   }
 
   // If it is a splat, check if the argument vector is another splat or a
-  // build_vector with all scalar elements the same.
+  // build_vector.
   if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
     SDNode *V = N0.getNode();
 
@@ -11030,6 +11840,18 @@
       // Splat of <x, x, x, x>, return <x, x, x, x>
       if (AllSame)
         return N0;
+
+      // Canonicalize any other splat as a build_vector.
+      const SDValue &Splatted = V->getOperand(SVN->getSplatIndex());
+      SmallVector<SDValue, 8> Ops(NumElts, Splatted);
+      SDValue NewBV = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N),
+                                  V->getValueType(0), Ops);
+
+      // We may have jumped through bitcasts, so the type of the
+      // BUILD_VECTOR may not match the type of the shuffle.
+      if (V->getValueType(0) != VT)
+        NewBV = DAG.getNode(ISD::BITCAST, SDLoc(N), VT, NewBV);
+      return NewBV;
     }
   }
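
Note: with the change above, a splat shuffle of a BUILD_VECTOR whose operands
are not all identical is canonicalized into a BUILD_VECTOR that repeats the
splatted operand. For example (operands made up):

    vector_shuffle (BUILD_VECTOR A, B, C, D), undef, <2, 2, 2, 2>
      -> BUILD_VECTOR C, C, C, C

with a BITCAST added on top only when bitcasts were looked through and the
BUILD_VECTOR type no longer matches the shuffle's result type.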
 
@@ -11050,121 +11872,11 @@
       return V;
   }
 
-  // If this shuffle node is simply a swizzle of another shuffle node,
-  // then try to simplify it.
-  if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
-      N1.getOpcode() == ISD::UNDEF) {
-
-    ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0);
-
-    // The incoming shuffle must be of the same type as the result of the
-    // current shuffle.
-    assert(OtherSV->getOperand(0).getValueType() == VT &&
-           "Shuffle types don't match");
-
-    SmallVector<int, 4> Mask;
-    // Compute the combined shuffle mask.
-    for (unsigned i = 0; i != NumElts; ++i) {
-      int Idx = SVN->getMaskElt(i);
-      assert(Idx < (int)NumElts && "Index references undef operand");
-      // Next, this index comes from the first value, which is the incoming
-      // shuffle. Adopt the incoming index.
-      if (Idx >= 0)
-        Idx = OtherSV->getMaskElt(Idx);
-      Mask.push_back(Idx);
-    }
-
-    // Check if all indices in Mask are Undef. In case, propagate Undef.
-    bool isUndefMask = true;
-    for (unsigned i = 0; i != NumElts && isUndefMask; ++i)
-      isUndefMask &= Mask[i] < 0;
-
-    if (isUndefMask)
-      return DAG.getUNDEF(VT);
-
-    bool CommuteOperands = false;
-    if (N0.getOperand(1).getOpcode() != ISD::UNDEF) {
-      // To be valid, the combine shuffle mask should only reference elements
-      // from one of the two vectors in input to the inner shufflevector.
-      bool IsValidMask = true;
-      for (unsigned i = 0; i != NumElts && IsValidMask; ++i)
-        // See if the combined mask only reference undefs or elements coming
-        // from the first shufflevector operand.
-        IsValidMask = Mask[i] < 0 || (unsigned)Mask[i] < NumElts;
-
-      if (!IsValidMask) {
-        IsValidMask = true;
-        for (unsigned i = 0; i != NumElts && IsValidMask; ++i)
-          // Check that all the elements come from the second shuffle operand.
-          IsValidMask = Mask[i] < 0 || (unsigned)Mask[i] >= NumElts;
-        CommuteOperands = IsValidMask;
-      }
-
-      // Early exit if the combined shuffle mask is not valid.
-      if (!IsValidMask)
-        return SDValue();
-    }
-
-    // See if this pair of shuffles can be safely folded according to either
-    // of the following rules:
-    //   shuffle(shuffle(x, y), undef) -> x
-    //   shuffle(shuffle(x, undef), undef) -> x
-    //   shuffle(shuffle(x, y), undef) -> y
-    bool IsIdentityMask = true;
-    unsigned BaseMaskIndex = CommuteOperands ? NumElts : 0;
-    for (unsigned i = 0; i != NumElts && IsIdentityMask; ++i) {
-      // Skip Undefs.
-      if (Mask[i] < 0)
-        continue;
-
-      // The combined shuffle must map each index to itself.
-      IsIdentityMask = (unsigned)Mask[i] == i + BaseMaskIndex;
-    }
-
-    if (IsIdentityMask) {
-      if (CommuteOperands)
-        // optimize shuffle(shuffle(x, y), undef) -> y.
-        return OtherSV->getOperand(1);
-
-      // optimize shuffle(shuffle(x, undef), undef) -> x
-      // optimize shuffle(shuffle(x, y), undef) -> x
-      return OtherSV->getOperand(0);
-    }
-
-    // It may still be beneficial to combine the two shuffles if the
-    // resulting shuffle is legal.
-    if (TLI.isTypeLegal(VT)) {
-      if (!CommuteOperands) {
-        if (TLI.isShuffleMaskLegal(Mask, VT))
-          // shuffle(shuffle(x, undef, M1), undef, M2) -> shuffle(x, undef, M3).
-          // shuffle(shuffle(x, y, M1), undef, M2) -> shuffle(x, undef, M3)
-          return DAG.getVectorShuffle(VT, SDLoc(N), N0->getOperand(0), N1,
-                                      &Mask[0]);
-      } else {
-        // Compute the commuted shuffle mask.
-        for (unsigned i = 0; i != NumElts; ++i) {
-          int idx = Mask[i];
-          if (idx < 0)
-            continue;
-          else if (idx < (int)NumElts)
-            Mask[i] = idx + NumElts;
-          else
-            Mask[i] = idx - NumElts;
-        }
-
-        if (TLI.isShuffleMaskLegal(Mask, VT))
-          //   shuffle(shuffle(x, y, M1), undef, M2) -> shuffle(y, undef, M3)
-          return DAG.getVectorShuffle(VT, SDLoc(N), N0->getOperand(1), N1,
-                                      &Mask[0]);
-      }
-    }
-  }
-
   // Canonicalize shuffles according to rules:
   //  shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A)
   //  shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B)
   //  shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
-  if (N1.getOpcode() == ISD::VECTOR_SHUFFLE && N0.getOpcode() != ISD::UNDEF &&
+  if (N1.getOpcode() == ISD::VECTOR_SHUFFLE &&
       N0.getOpcode() != ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
       TLI.isTypeLegal(VT)) {
     // The incoming shuffle must be of the same type as the result of the
@@ -11183,13 +11895,13 @@
   }
 
   // Try to fold according to rules:
-  //   shuffle(shuffle(A, B, M0), B, M1) -> shuffle(A, B, M2)
-  //   shuffle(shuffle(A, B, M0), A, M1) -> shuffle(A, B, M2)
-  //   shuffle(shuffle(A, Undef, M0), B, M1) -> shuffle(A, B, M2)
-  //   shuffle(shuffle(A, Undef, M0), A, M1) -> shuffle(A, Undef, M2)
+  //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
+  //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
+  //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
   // Don't try to fold shuffles with illegal type.
-  if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
-      N1.getOpcode() != ISD::UNDEF && TLI.isTypeLegal(VT)) {
+  // Only fold if this shuffle is the only user of the other shuffle.
+  if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && N->isOnlyUserOf(N0.getNode()) &&
+      Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) {
     ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0);
 
     // The incoming shuffle must be of the same type as the result of the
@@ -11197,14 +11909,7 @@
     assert(OtherSV->getOperand(0).getValueType() == VT &&
            "Shuffle types don't match");
 
-    SDValue SV0 = OtherSV->getOperand(0);
-    SDValue SV1 = OtherSV->getOperand(1);
-    bool HasSameOp0 = N1 == SV0;
-    bool IsSV1Undef = SV1.getOpcode() == ISD::UNDEF;
-    if (!HasSameOp0 && !IsSV1Undef && N1 != SV1)
-      // Early exit.
-      return SDValue();
-
+    SDValue SV0, SV1;
     SmallVector<int, 4> Mask;
     // Compute the combined shuffle mask for a shuffle with SV0 as the first
     // operand, and SV1 as the second operand.
@@ -11216,14 +11921,49 @@
         continue;
       }
 
+      SDValue CurrentVec;
       if (Idx < (int)NumElts) {
+        // This shuffle index refers to the inner shuffle N0. Look up the
+        // inner shuffle mask to identify which vector is actually referenced.
         Idx = OtherSV->getMaskElt(Idx);
-        if (IsSV1Undef && Idx >= (int) NumElts)
-          Idx = -1;  // Propagate Undef.
-      } else
-        Idx = HasSameOp0 ? Idx - NumElts : Idx;
+        if (Idx < 0) {
+          // Propagate Undef.
+          Mask.push_back(Idx);
+          continue;
+        }
 
-      Mask.push_back(Idx);
+        CurrentVec = (Idx < (int) NumElts) ? OtherSV->getOperand(0)
+                                           : OtherSV->getOperand(1);
+      } else {
+        // This shuffle index references an element within N1.
+        CurrentVec = N1;
+      }
+
+      // Simple case where 'CurrentVec' is UNDEF.
+      if (CurrentVec.getOpcode() == ISD::UNDEF) {
+        Mask.push_back(-1);
+        continue;
+      }
+
+      // Canonicalize the shuffle index. We don't know yet if CurrentVec
+      // will be the first or second operand of the combined shuffle.
+      Idx = Idx % NumElts;
+      if (!SV0.getNode() || SV0 == CurrentVec) {
+        // Ok. CurrentVec is the left hand side.
+        // Update the mask accordingly.
+        SV0 = CurrentVec;
+        Mask.push_back(Idx);
+        continue;
+      }
+
+      // Bail out if we cannot convert the shuffle pair into a single shuffle.
+      if (SV1.getNode() && SV1 != CurrentVec)
+        return SDValue();
+
+      // Ok. CurrentVec is the right hand side.
+      // Update the mask accordingly.
+      SV1 = CurrentVec;
+      Mask.push_back(Idx + NumElts);
     }
 
     // Check if all indices in Mask are Undef. If so, propagate Undef.
@@ -11234,34 +11974,37 @@
     if (isUndefMask)
       return DAG.getUNDEF(VT);
 
+    if (!SV0.getNode())
+      SV0 = DAG.getUNDEF(VT);
+    if (!SV1.getNode())
+      SV1 = DAG.getUNDEF(VT);
+
     // Avoid introducing shuffles with illegal mask.
-    if (TLI.isShuffleMaskLegal(Mask, VT)) {
-      if (IsSV1Undef)
-        //   shuffle(shuffle(A, Undef, M0), B, M1) -> shuffle(A, B, M2)
-        //   shuffle(shuffle(A, Undef, M0), A, M1) -> shuffle(A, Undef, M2)
-        return DAG.getVectorShuffle(VT, SDLoc(N), SV0, N1, &Mask[0]);
-      return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, &Mask[0]);
+    if (!TLI.isShuffleMaskLegal(Mask, VT)) {
+      // Compute the commuted shuffle mask and test again.
+      for (unsigned i = 0; i != NumElts; ++i) {
+        int idx = Mask[i];
+        if (idx < 0)
+          continue;
+        else if (idx < (int)NumElts)
+          Mask[i] = idx + NumElts;
+        else
+          Mask[i] = idx - NumElts;
+      }
+
+      if (!TLI.isShuffleMaskLegal(Mask, VT))
+        return SDValue();
+
+      //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, A, M2)
+      //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, A, M2)
+      //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, B, M2)
+      std::swap(SV0, SV1);
     }
 
-    // Compute the commuted shuffle mask.
-    for (unsigned i = 0; i != NumElts; ++i) {
-      int idx = Mask[i];
-      if (idx < 0)
-        continue;
-      else if (idx < (int)NumElts)
-        Mask[i] = idx + NumElts;
-      else
-        Mask[i] = idx - NumElts;
-    }
-
-    if (TLI.isShuffleMaskLegal(Mask, VT)) {
-      if (IsSV1Undef)
-        //   shuffle(shuffle(A, Undef, M0), B, M1) -> shuffle(B, A, M2)
-        return DAG.getVectorShuffle(VT, SDLoc(N), N1, SV0, &Mask[0]);
-      //   shuffle(shuffle(A, B, M0), B, M1) -> shuffle(B, A, M2)
-      //   shuffle(shuffle(A, B, M0), A, M1) -> shuffle(B, A, M2)
-      return DAG.getVectorShuffle(VT, SDLoc(N), SV1, SV0, &Mask[0]);
-    }
+    //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
+    //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
+    //   shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
+    return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, &Mask[0]);
   }
 
   return SDValue();
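
Note on the generalized fold above: the combined mask is built by looking each
outer index up in the inner shuffle's mask, and SV0/SV1 are bound to whichever
source vectors the surviving lanes actually reference, so the inner shuffle no
longer has to share an operand with the outer one. A worked instance (masks and
operands made up, NumElts = 4):

    shuffle(shuffle(A, B, <0, 4, 1, 5>), C, <0, 2, 4, 6>)
      -> shuffle(A, C, <0, 1, 4, 6>)

Outer indices 0 and 2 resolve through the inner mask to A[0] and A[1]; outer
indices 4 and 6 select C[0] and C[2] directly. B is never referenced, so the
pair still collapses to a single shuffle. If <0, 1, 4, 6> were not a legal
mask, the code above would swap SV0 and SV1 and retry with the commuted mask
<4, 5, 0, 2>.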
@@ -11322,9 +12065,11 @@
           return SDValue();
       }
 
-      // Let's see if the target supports this vector_shuffle.
+      // Let's see if the target supports this vector_shuffle and make sure
+      // we're not running after operation legalization where it may have
+      // custom lowered the vector shuffles.
       EVT RVT = RHS.getValueType();
-      if (!TLI.isVectorClearMaskLegal(Indices, RVT))
+      if (LegalOperations || !TLI.isVectorClearMaskLegal(Indices, RVT))
         return SDValue();
 
       // Return the new VECTOR_SHUFFLE node.
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index 8facbc2..1df4a1d 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -40,12 +40,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/Analysis.h"
-#include "llvm/CodeGen/FastISel.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/FastISel.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -62,7 +63,6 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
@@ -728,6 +728,7 @@
   // For AnyRegCC the arguments are lowered later on manually.
   unsigned NumCallArgs = IsAnyRegCC ? 0 : NumArgs;
   CallLoweringInfo CLI;
+  CLI.setIsPatchPoint();
   if (!lowerCallOperands(I, NumMetaOpers, NumCallArgs, Callee, IsAnyRegCC, CLI))
     return false;
 
@@ -1579,7 +1580,7 @@
                    bool SkipTargetIndependentISel)
     : FuncInfo(FuncInfo), MF(FuncInfo.MF), MRI(FuncInfo.MF->getRegInfo()),
       MFI(*FuncInfo.MF->getFrameInfo()), MCP(*FuncInfo.MF->getConstantPool()),
-      TM(FuncInfo.MF->getTarget()), DL(*MF->getSubtarget().getDataLayout()),
+      TM(FuncInfo.MF->getTarget()), DL(*TM.getDataLayout()),
       TII(*MF->getSubtarget().getInstrInfo()),
       TLI(*MF->getSubtarget().getTargetLowering()),
       TRI(*MF->getSubtarget().getRegisterInfo()), LibInfo(LibInfo),
diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index 86b9542..7e72dc6 100644
--- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -133,16 +133,17 @@
         ImmutableCallSite CS(I);
         if (isa<InlineAsm>(CS.getCalledValue())) {
           unsigned SP = TLI->getStackPointerRegisterToSaveRestore();
+          const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
           std::vector<TargetLowering::AsmOperandInfo> Ops =
-            TLI->ParseConstraints(CS);
+              TLI->ParseConstraints(TRI, CS);
           for (size_t I = 0, E = Ops.size(); I != E; ++I) {
             TargetLowering::AsmOperandInfo &Op = Ops[I];
             if (Op.Type == InlineAsm::isClobber) {
               // Clobbers don't have SDValue operands, hence SDValue().
               TLI->ComputeConstraintToUse(Op, SDValue(), DAG);
               std::pair<unsigned, const TargetRegisterClass *> PhysReg =
-                  TLI->getRegForInlineAsmConstraint(Op.ConstraintCode,
-                                                   Op.ConstraintVT);
+                  TLI->getRegForInlineAsmConstraint(TRI, Op.ConstraintCode,
+                                                    Op.ConstraintVT);
               if (PhysReg.first == SP)
                 MF->getFrameInfo()->setHasInlineAsmWithSPAdjust(true);
             }
@@ -273,6 +274,7 @@
   ArgDbgValues.clear();
   ByValArgFrameIndexMap.clear();
   RegFixups.clear();
+  StatepointStackSlots.clear();
   PreferredExtendType.clear();
 }
 
@@ -470,60 +472,6 @@
   }
 }
 
-/// AddCatchInfo - Extract the personality and type infos from an eh.selector
-/// call, and add them to the specified machine basic block.
-void llvm::AddCatchInfo(const CallInst &I, MachineModuleInfo *MMI,
-                        MachineBasicBlock *MBB) {
-  // Inform the MachineModuleInfo of the personality for this landing pad.
-  const ConstantExpr *CE = cast<ConstantExpr>(I.getArgOperand(1));
-  assert(CE->getOpcode() == Instruction::BitCast &&
-         isa<Function>(CE->getOperand(0)) &&
-         "Personality should be a function");
-  MMI->addPersonality(MBB, cast<Function>(CE->getOperand(0)));
-
-  // Gather all the type infos for this landing pad and pass them along to
-  // MachineModuleInfo.
-  std::vector<const GlobalValue *> TyInfo;
-  unsigned N = I.getNumArgOperands();
-
-  for (unsigned i = N - 1; i > 1; --i) {
-    if (const ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(i))) {
-      unsigned FilterLength = CI->getZExtValue();
-      unsigned FirstCatch = i + FilterLength + !FilterLength;
-      assert(FirstCatch <= N && "Invalid filter length");
-
-      if (FirstCatch < N) {
-        TyInfo.reserve(N - FirstCatch);
-        for (unsigned j = FirstCatch; j < N; ++j)
-          TyInfo.push_back(ExtractTypeInfo(I.getArgOperand(j)));
-        MMI->addCatchTypeInfo(MBB, TyInfo);
-        TyInfo.clear();
-      }
-
-      if (!FilterLength) {
-        // Cleanup.
-        MMI->addCleanup(MBB);
-      } else {
-        // Filter.
-        TyInfo.reserve(FilterLength - 1);
-        for (unsigned j = i + 1; j < FirstCatch; ++j)
-          TyInfo.push_back(ExtractTypeInfo(I.getArgOperand(j)));
-        MMI->addFilterTypeInfo(MBB, TyInfo);
-        TyInfo.clear();
-      }
-
-      N = i;
-    }
-  }
-
-  if (N > 2) {
-    TyInfo.reserve(N - 2);
-    for (unsigned j = 2; j < N; ++j)
-      TyInfo.push_back(ExtractTypeInfo(I.getArgOperand(j)));
-    MMI->addCatchTypeInfo(MBB, TyInfo);
-  }
-}
-
 /// AddLandingPadInfo - Extract the exception handling information from the
 /// landingpad instruction and add them to the specified machine module info.
 void llvm::AddLandingPadInfo(const LandingPadInst &I, MachineModuleInfo &MMI,
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index a65f33e..93699a7 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -406,10 +406,10 @@
     Type *Type = CP->getType();
     // MachineConstantPool wants an explicit alignment.
     if (Align == 0) {
-      Align = MF->getSubtarget().getDataLayout()->getPrefTypeAlignment(Type);
+      Align = MF->getTarget().getDataLayout()->getPrefTypeAlignment(Type);
       if (Align == 0) {
         // Alignment of vector types.  FIXME!
-        Align = MF->getSubtarget().getDataLayout()->getTypeAllocSize(Type);
+        Align = MF->getTarget().getDataLayout()->getTypeAllocSize(Type);
       }
     }
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 5d17a5f..61c0a6f 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -40,7 +40,7 @@
 #define DEBUG_TYPE "legalizedag"
 
 //===----------------------------------------------------------------------===//
-/// SelectionDAGLegalize - This takes an arbitrary SelectionDAG as input and
+/// This takes an arbitrary SelectionDAG as input and
 /// hacks on it until the target machine can handle it.  This involves
 /// eliminating value sizes the machine cannot handle (promoting small sizes to
 /// large sizes or splitting up large values into small values) as well as
@@ -86,7 +86,7 @@
   void LegalizeLoadOps(SDNode *Node);
   void LegalizeStoreOps(SDNode *Node);
 
-  /// PerformInsertVectorEltInMemory - Some target cannot handle a variable
+  /// Some targets cannot handle a variable
   /// insertion index for the INSERT_VECTOR_ELT instruction.  In this case, it
   /// is necessary to spill the vector being inserted into to memory, perform
   /// the insert there, and then read the result back.
@@ -95,7 +95,7 @@
   SDValue ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val,
                                   SDValue Idx, SDLoc dl);
 
-  /// ShuffleWithNarrowerEltType - Return a vector shuffle operation which
+  /// Return a vector shuffle operation which
   /// performs the same shuffle in terms of order or result bytes, but on a type
   /// whose vector element type is narrower than the original shuffle type.
   /// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
@@ -200,7 +200,7 @@
 };
 }
 
-/// ShuffleWithNarrowerEltType - Return a vector shuffle operation which
+/// Return a vector shuffle operation which
 /// performs the same shuffe in terms of order or result bytes, but on a type
 /// whose vector element type is narrower than the original shuffle type.
 /// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
@@ -232,7 +232,7 @@
   return DAG.getVectorShuffle(NVT, dl, N1, N2, &NewMask[0]);
 }
 
-/// ExpandConstantFP - Expands the ConstantFP node to an integer constant or
+/// Expands the ConstantFP node to an integer constant or
 /// a load from the constant pool.
 SDValue
 SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) {
@@ -260,7 +260,7 @@
     if (ConstantFPSDNode::isValueValidForType(SVT, CFP->getValueAPF()) &&
         // Only do this if the target has a native EXTLOAD instruction from
         // smaller type.
-        TLI.isLoadExtLegal(ISD::EXTLOAD, SVT) &&
+        TLI.isLoadExtLegal(ISD::EXTLOAD, OrigVT, SVT) &&
         TLI.ShouldShrinkFPConstant(OrigVT)) {
       Type *SType = SVT.getTypeForEVT(*DAG.getContext());
       LLVMC = cast<ConstantFP>(ConstantExpr::getFPTrunc(LLVMC, SType));
@@ -286,7 +286,7 @@
   return Result;
 }
 
-/// ExpandUnalignedStore - Expands an unaligned store to 2 half-size stores.
+/// Expands an unaligned store to 2 half-size stores.
 static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG,
                                  const TargetLowering &TLI,
                                  SelectionDAGLegalize *DAGLegalize) {
@@ -409,7 +409,7 @@
   DAGLegalize->ReplaceNode(SDValue(ST, 0), Result);
 }
 
-/// ExpandUnalignedLoad - Expands an unaligned load to 2 half-size loads.
+/// Expands an unaligned load to 2 half-size loads.
 static void
 ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
                     const TargetLowering &TLI,
@@ -561,8 +561,8 @@
   ChainResult = TF;
 }
 
-/// PerformInsertVectorEltInMemory - Some target cannot handle a variable
-/// insertion index for the INSERT_VECTOR_ELT instruction.  In this case, it
+/// Some targets cannot handle a variable insertion index for the
+/// INSERT_VECTOR_ELT instruction.  In this case, it
 /// is necessary to spill the vector being inserted into to memory, perform
 /// the insert there, and then read the result back.
 SDValue SelectionDAGLegalize::
@@ -725,14 +725,13 @@
             Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext());
             unsigned ABIAlignment= TLI.getDataLayout()->getABITypeAlignment(Ty);
             if (Align < ABIAlignment)
-              ExpandUnalignedStore(cast<StoreSDNode>(Node),
-                                   DAG, TLI, this);
+              ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this);
           }
           break;
         }
         case TargetLowering::Custom: {
           SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
-          if (Res.getNode())
+          if (Res && Res != SDValue(Node, 0))
             ReplaceNode(SDValue(Node, 0), Res);
           return;
         }
@@ -766,8 +765,7 @@
         Value = DAG.getZeroExtendInReg(Value, dl, StVT);
         SDValue Result =
           DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
-                            NVT, isVolatile, isNonTemporal, Alignment,
-                            AAInfo);
+                            NVT, isVolatile, isNonTemporal, Alignment, AAInfo);
         ReplaceNode(SDValue(Node, 0), Result);
       } else if (StWidth & (StWidth - 1)) {
         // If not storing a power-of-2 number of bits, expand as two stores.
@@ -845,7 +843,7 @@
         }
         case TargetLowering::Custom: {
           SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
-          if (Res.getNode())
+          if (Res && Res != SDValue(Node, 0))
             ReplaceNode(SDValue(Node, 0), Res);
           return;
         }
@@ -946,7 +944,8 @@
       // nice to have an effective generic way of getting these benefits...
       // Until such a way is found, don't insist on promoting i1 here.
       (SrcVT != MVT::i1 ||
-       TLI.getLoadExtAction(ExtType, MVT::i1) == TargetLowering::Promote)) {
+       TLI.getLoadExtAction(ExtType, Node->getValueType(0), MVT::i1) ==
+         TargetLowering::Promote)) {
     // Promote to a byte-sized load if not loading an integral number of
     // bytes.  For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
     unsigned NewWidth = SrcVT.getStoreSizeInBits();
@@ -1058,7 +1057,8 @@
     Chain = Ch;
   } else {
     bool isCustom = false;
-    switch (TLI.getLoadExtAction(ExtType, SrcVT.getSimpleVT())) {
+    switch (TLI.getLoadExtAction(ExtType, Node->getValueType(0),
+                                 SrcVT.getSimpleVT())) {
     default: llvm_unreachable("This action is not supported yet!");
     case TargetLowering::Custom:
       isCustom = true;
@@ -1080,36 +1080,35 @@
         unsigned AS = LD->getAddressSpace();
         unsigned Align = LD->getAlignment();
         if (!TLI.allowsMisalignedMemoryAccesses(MemVT, AS, Align)) {
-          Type *Ty =
-            LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
-          unsigned ABIAlignment =
-            TLI.getDataLayout()->getABITypeAlignment(Ty);
+          Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
+          unsigned ABIAlignment = TLI.getDataLayout()->getABITypeAlignment(Ty);
           if (Align < ABIAlignment){
-            ExpandUnalignedLoad(cast<LoadSDNode>(Node),
-                                DAG, TLI, Value, Chain);
+            ExpandUnalignedLoad(cast<LoadSDNode>(Node), DAG, TLI, Value, Chain);
           }
         }
       }
       break;
     }
     case TargetLowering::Expand:
-      if (!TLI.isLoadExtLegal(ISD::EXTLOAD, SrcVT) &&
-          TLI.isTypeLegal(SrcVT)) {
-        SDValue Load = DAG.getLoad(SrcVT, dl, Chain, Ptr,
-                                   LD->getMemOperand());
-        unsigned ExtendOp;
-        switch (ExtType) {
-        case ISD::EXTLOAD:
-          ExtendOp = (SrcVT.isFloatingPoint() ?
-                      ISD::FP_EXTEND : ISD::ANY_EXTEND);
+      if (!TLI.isLoadExtLegal(ISD::EXTLOAD, Node->getValueType(0), SrcVT)) {
+        // If the source type is not legal, see if there is a legal extload to
+        // an intermediate type that we can then extend further.
+        EVT LoadVT = TLI.getRegisterType(SrcVT.getSimpleVT());
+        if (TLI.isTypeLegal(SrcVT) || // Same as SrcVT == LoadVT?
+            TLI.isLoadExtLegal(ExtType, LoadVT, SrcVT)) {
+          // If we are loading a legal type, this is a non-extload followed by a
+          // full extend.
+          ISD::LoadExtType MidExtType =
+              (LoadVT == SrcVT) ? ISD::NON_EXTLOAD : ExtType;
+
+          SDValue Load = DAG.getExtLoad(MidExtType, dl, LoadVT, Chain, Ptr,
+                                        SrcVT, LD->getMemOperand());
+          unsigned ExtendOp =
+              ISD::getExtForLoadExtType(SrcVT.isFloatingPoint(), ExtType);
+          Value = DAG.getNode(ExtendOp, dl, Node->getValueType(0), Load);
+          Chain = Load.getValue(1);
           break;
-        case ISD::SEXTLOAD: ExtendOp = ISD::SIGN_EXTEND; break;
-        case ISD::ZEXTLOAD: ExtendOp = ISD::ZERO_EXTEND; break;
-        default: llvm_unreachable("Unexpected extend load type!");
         }
-        Value = DAG.getNode(ExtendOp, dl, Node->getValueType(0), Load);
-        Chain = Load.getValue(1);
-        break;
       }
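
Note: the new Expand path above no longer requires the source type itself to be
legal; it is enough that a legal (possibly extending) load exists to the
register type for that source, which can then be extended further to the
result. Schematically (all types are made up, and the intermediate extload is
assumed to be legal on the target):

    i64 = extload i24 from [p]
      -> t:   i32 = extload i24 from [p]
         res: i64 = any_extend t

where the final extend opcode tracks the original extension kind through
getExtForLoadExtType (ANY_EXTEND here because the original was an EXTLOAD).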
 
       assert(!SrcVT.isVector() &&
@@ -1133,8 +1132,7 @@
                              Result.getValueType(),
                              Result, DAG.getValueType(SrcVT));
       else
-        ValRes = DAG.getZeroExtendInReg(Result, dl,
-                                        SrcVT.getScalarType());
+        ValRes = DAG.getZeroExtendInReg(Result, dl, SrcVT.getScalarType());
       Value = ValRes;
       Chain = Result.getValue(1);
       break;
@@ -1155,8 +1153,7 @@
   }
 }
 
-/// LegalizeOp - Return a legal replacement for the given operation, with
-/// all legal operands.
+/// Return a legal replacement for the given operation, with all legal operands.
 void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
   DEBUG(dbgs() << "\nLegalizing: "; Node->dump(&DAG));
 
@@ -1642,8 +1639,8 @@
   Results.push_back(Tmp2);
 }
 
-/// LegalizeSetCCCondCode - Legalize a SETCC with given LHS and RHS and
-/// condition code CC on the current target.
+/// Legalize a SETCC with given LHS and RHS and condition code CC on the current
+/// target.
 ///
 /// If the SETCC has been legalized using AND / OR, then the legalized node
 /// will be stored in LHS. RHS and CC will be set to SDValue(). NeedInvert
@@ -1757,7 +1754,7 @@
   return false;
 }
 
-/// EmitStackConvert - Emit a store/load combination to the stack.  This stores
+/// Emit a store/load combination to the stack.  This stores
 /// SrcOp to a stack slot of type SlotVT, truncating it if needed.  It then does
 /// a load from the stack slot to DestVT, extending it if needed.
 /// The resultant code need not be legal.
@@ -1917,7 +1914,7 @@
   return true;
 }
 
-/// ExpandBUILD_VECTOR - Expand a BUILD_VECTOR node on targets that don't
+/// Expand a BUILD_VECTOR node on targets that don't
 /// support the operation, but do support the resultant vector type.
 SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
   unsigned NumElems = Node->getNumOperands();
@@ -2029,7 +2026,7 @@
   return ExpandVectorBuildThroughStack(Node);
 }
 
-// ExpandLibCall - Expand a node into a call to a libcall.  If the result value
+// Expand a node into a call to a libcall.  If the result value
 // does not fit into a register, return the lo part and set the hi part to the
 // by-reg argument.  If it does fit into a single register, return the result
 // and leave the Hi part unset.
@@ -2077,7 +2074,7 @@
   return CallInfo.first;
 }
 
-/// ExpandLibCall - Generate a libcall taking the given operands as arguments
+/// Generate a libcall taking the given operands as arguments
 /// and returning a result of type RetVT.
 SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,
                                             const SDValue *Ops, unsigned NumOps,
@@ -2108,7 +2105,7 @@
   return CallInfo.first;
 }
 
-// ExpandChainLibCall - Expand a node into a call to a libcall. Similar to
+// Expand a node into a call to a libcall. Similar to
 // ExpandLibCall except that the first operand is the in-chain.
 std::pair<SDValue, SDValue>
 SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,
@@ -2178,7 +2175,7 @@
   return ExpandLibCall(LC, Node, isSigned);
 }
 
-/// isDivRemLibcallAvailable - Return true if divmod libcall is available.
+/// Return true if divmod libcall is available.
 static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned,
                                      const TargetLowering &TLI) {
   RTLIB::Libcall LC;
@@ -2194,8 +2191,7 @@
   return TLI.getLibcallName(LC) != nullptr;
 }
 
-/// useDivRem - Only issue divrem libcall if both quotient and remainder are
-/// needed.
+/// Only issue divrem libcall if both quotient and remainder are needed.
 static bool useDivRem(SDNode *Node, bool isSigned, bool isDIV) {
   // The other use might have been replaced with a divrem already.
   unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
@@ -2220,8 +2216,7 @@
   return false;
 }
 
-/// ExpandDivRemLibCall - Issue libcalls to __{u}divmod to compute div / rem
-/// pairs.
+/// Issue libcalls to __{u}divmod to compute div / rem pairs.
 void
 SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
                                           SmallVectorImpl<SDValue> &Results) {
@@ -2283,7 +2278,7 @@
   Results.push_back(Rem);
 }
 
-/// isSinCosLibcallAvailable - Return true if sincos libcall is available.
+/// Return true if sincos libcall is available.
 static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) {
   RTLIB::Libcall LC;
   switch (Node->getSimpleValueType(0).SimpleTy) {
@@ -2297,8 +2292,8 @@
   return TLI.getLibcallName(LC) != nullptr;
 }
 
-/// canCombineSinCosLibcall - Return true if sincos libcall is available and
-/// can be used to combine sin and cos.
+/// Return true if sincos libcall is available and can be used to combine sin
+/// and cos.
 static bool canCombineSinCosLibcall(SDNode *Node, const TargetLowering &TLI,
                                     const TargetMachine &TM) {
   if (!isSinCosLibcallAvailable(Node, TLI))
@@ -2311,8 +2306,7 @@
   return true;
 }
 
-/// useSinCos - Only issue sincos libcall if both sin and cos are
-/// needed.
+/// Only issue sincos libcall if both sin and cos are needed.
 static bool useSinCos(SDNode *Node) {
   unsigned OtherOpcode = Node->getOpcode() == ISD::FSIN
     ? ISD::FCOS : ISD::FSIN;
@@ -2330,8 +2324,7 @@
   return false;
 }
 
-/// ExpandSinCosLibCall - Issue libcalls to sincos to compute sin / cos
-/// pairs.
+/// Issue libcalls to sincos to compute sin / cos pairs.
 void
 SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
                                           SmallVectorImpl<SDValue> &Results) {
@@ -2396,7 +2389,7 @@
                                 MachinePointerInfo(), false, false, false, 0));
 }
 
-/// ExpandLegalINT_TO_FP - This function is responsible for legalizing a
+/// This function is responsible for legalizing a
 /// INT_TO_FP operation of the specified operand when the target requests that
 /// we expand it.  At this point, we know that the result and operand types are
 /// legal for the target.
@@ -2594,7 +2587,7 @@
   return DAG.getNode(ISD::FADD, dl, DestVT, Tmp1, FudgeInReg);
 }
 
-/// PromoteLegalINT_TO_FP - This function is responsible for legalizing a
+/// This function is responsible for legalizing a
 /// *INT_TO_FP operation of the specified operand when the target requests that
 /// we promote it.  At this point, we know that the result and operand types are
 /// legal for the target, and that there is a legal UINT_TO_FP or SINT_TO_FP
@@ -2636,7 +2629,7 @@
                                  dl, NewInTy, LegalOp));
 }
 
-/// PromoteLegalFP_TO_INT - This function is responsible for legalizing a
+/// This function is responsible for legalizing a
 /// FP_TO_*INT operation of the specified operand when the target requests that
 /// we promote it.  At this point, we know that the result and operand types are
 /// legal for the target, and that there is a legal FP_TO_UINT or FP_TO_SINT
@@ -2680,8 +2673,7 @@
   return DAG.getNode(ISD::TRUNCATE, dl, DestVT, Operation);
 }
 
-/// ExpandBSWAP - Open code the operations for BSWAP of the specified operation.
-///
+/// Open code the operations for BSWAP of the specified operation.
 SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, SDLoc dl) {
   EVT VT = Op.getValueType();
   EVT SHVT = TLI.getShiftAmountTy(VT);
@@ -2727,8 +2719,7 @@
   }
 }
 
-/// ExpandBitCount - Expand the specified bitcount instruction into operations.
-///
+/// Expand the specified bitcount instruction into operations.
 SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
                                              SDLoc dl) {
   switch (Opc) {
@@ -3528,6 +3519,9 @@
                                       RTLIB::FMA_F80, RTLIB::FMA_F128,
                                       RTLIB::FMA_PPCF128));
     break;
+  case ISD::FMAD:
+    llvm_unreachable("Illegal fmad should never be formed");
+
   case ISD::FADD:
     Results.push_back(ExpandFPLibCall(Node, RTLIB::ADD_F32, RTLIB::ADD_F64,
                                       RTLIB::ADD_F80, RTLIB::ADD_F128,
@@ -3554,6 +3548,21 @@
     break;
   }
   case ISD::FP_TO_FP16: {
+    if (!TM.Options.UseSoftFloat && TM.Options.UnsafeFPMath) {
+      SDValue Op = Node->getOperand(0);
+      MVT SVT = Op.getSimpleValueType();
+      if ((SVT == MVT::f64 || SVT == MVT::f80) &&
+          TLI.isOperationLegalOrCustom(ISD::FP_TO_FP16, MVT::f32)) {
+        // Under fastmath, we can expand this node into a fround followed by
+        // a float-half conversion.
+        SDValue FloatVal = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Op,
+                                       DAG.getIntPtrConstant(0));
+        Results.push_back(
+            DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, FloatVal));
+        break;
+      }
+    }
+
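
Schematically, the fast-math path above rewrites (the operand type is made up)

    i16 = fp_to_fp16 f64:x   ->   i16 = fp_to_fp16 (f32 = fp_round x, 0)

so that only the f32-to-half conversion has to be supported (legal or custom)
rather than a libcall for the wider source. The intermediate f32 rounding can
change the result in rare double-rounding cases, which is why the shortcut is
limited to UnsafeFPMath.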
     RTLIB::Libcall LC =
         RTLIB::getFPROUND(Node->getOperand(0).getValueType(), MVT::f16);
     assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to expand fp_to_fp16");
@@ -4319,8 +4328,7 @@
     ReplaceNode(Node, Results.data());
 }
 
-// SelectionDAG::Legalize - This is the entry point for the file.
-//
+/// This is the entry point for the file.
 void SelectionDAG::Legalize() {
   AssignTopologicalOrder();
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 4591e79..b596715 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -658,7 +658,7 @@
                            NVT, N->getOperand(0));
   return TLI.makeLibCall(DAG, LC,
                          TLI.getTypeToTransformTo(*DAG.getContext(), RVT),
-                         &Op, 1, false, dl).first;
+                         &Op, 1, Signed, dl).first;
 }
 
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index b73bb0a..5507c70 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -66,6 +66,7 @@
   case ISD::EXTRACT_VECTOR_ELT:
                          Res = PromoteIntRes_EXTRACT_VECTOR_ELT(N); break;
   case ISD::LOAD:        Res = PromoteIntRes_LOAD(cast<LoadSDNode>(N));break;
+  case ISD::MLOAD:       Res = PromoteIntRes_MLOAD(cast<MaskedLoadSDNode>(N));break;
   case ISD::SELECT:      Res = PromoteIntRes_SELECT(N); break;
   case ISD::VSELECT:     Res = PromoteIntRes_VSELECT(N); break;
   case ISD::SELECT_CC:   Res = PromoteIntRes_SELECT_CC(N); break;
@@ -454,6 +455,24 @@
   return Res;
 }
 
+SDValue DAGTypeLegalizer::PromoteIntRes_MLOAD(MaskedLoadSDNode *N) {
+  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+  SDValue ExtSrc0 = GetPromotedInteger(N->getSrc0());
+
+  SDValue Mask = N->getMask();
+  EVT NewMaskVT = getSetCCResultType(NVT);
+  if (NewMaskVT != N->getMask().getValueType())
+    Mask = PromoteTargetBoolean(Mask, NewMaskVT);
+  SDLoc dl(N);
+
+  SDValue Res = DAG.getMaskedLoad(NVT, dl, N->getChain(), N->getBasePtr(),
+                                  Mask, ExtSrc0, N->getMemoryVT(),
+                                  N->getMemOperand(), ISD::SEXTLOAD);
+  // Legalized the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+  return Res;
+}
 /// Promote the overflow flag of an overflowing arithmetic node.
 SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) {
   // Simply change the return type of the boolean result.
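To make the behaviour of the new PromoteIntRes_MLOAD above concrete: masked-off lanes keep the (promoted) pass-through value, while active lanes load the narrow element and sign-extend it, matching the ISD::SEXTLOAD extension type used in the call. A scalar reference model with illustrative names, not LLVM API:

```cpp
#include <cstdint>
#include <vector>

// Reference semantics of a sign-extending masked load of i8 into promoted i32.
// Assumes Mem, Mask and Src0 all cover the same number of elements.
std::vector<int32_t> maskedSextLoadRef(const int8_t *Mem,
                                       const std::vector<bool> &Mask,
                                       const std::vector<int32_t> &Src0) {
  std::vector<int32_t> Result(Src0);             // pass-through for off lanes
  for (size_t i = 0; i != Mask.size(); ++i)
    if (Mask[i])
      Result[i] = static_cast<int32_t>(Mem[i]);  // load + sign extend
  return Result;
}
```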
@@ -825,6 +844,10 @@
   case ISD::SINT_TO_FP:   Res = PromoteIntOp_SINT_TO_FP(N); break;
   case ISD::STORE:        Res = PromoteIntOp_STORE(cast<StoreSDNode>(N),
                                                    OpNo); break;
+  case ISD::MSTORE:       Res = PromoteIntOp_MSTORE(cast<MaskedStoreSDNode>(N),
+                                                    OpNo); break;
+  case ISD::MLOAD:        Res = PromoteIntOp_MLOAD(cast<MaskedLoadSDNode>(N),
+                                                    OpNo); break;
   case ISD::TRUNCATE:     Res = PromoteIntOp_TRUNCATE(N); break;
   case ISD::FP16_TO_FP:
   case ISD::UINT_TO_FP:   Res = PromoteIntOp_UINT_TO_FP(N); break;
@@ -1091,6 +1114,64 @@
                            N->getMemoryVT(), N->getMemOperand());
 }
 
+SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N,
+                                              unsigned OpNo) {
+  assert(OpNo == 2 && "Only know how to promote the mask!");
+  SDValue DataOp = N->getValue();
+  EVT DataVT = DataOp.getValueType();
+  SDValue Mask = N->getMask();
+  EVT MaskVT = Mask.getValueType();
+  SDLoc dl(N);
+
+  bool TruncateStore = false;
+  if (!TLI.isTypeLegal(DataVT)) {
+    if (getTypeAction(DataVT) == TargetLowering::TypePromoteInteger) {
+      DataOp = GetPromotedInteger(DataOp);
+      Mask = PromoteTargetBoolean(Mask, DataOp.getValueType());
+      TruncateStore = true;
+    }
+    else {
+      assert(getTypeAction(DataVT) == TargetLowering::TypeWidenVector &&
+             "Unexpected data legalization in MSTORE");
+      DataOp = GetWidenedVector(DataOp);
+
+      if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
+        Mask = GetWidenedVector(Mask);
+      else {
+        EVT BoolVT = getSetCCResultType(DataOp.getValueType());
+
+        // We can't use ModifyToType() because the extra mask lanes must be
+        // filled with zeroes.
+        unsigned WidenNumElts = BoolVT.getVectorNumElements();
+        unsigned MaskNumElts = MaskVT.getVectorNumElements();
+
+        unsigned NumConcat = WidenNumElts / MaskNumElts;
+        SmallVector<SDValue, 16> Ops(NumConcat);
+        SDValue ZeroVal = DAG.getConstant(0, MaskVT);
+        Ops[0] = Mask;
+        for (unsigned i = 1; i != NumConcat; ++i)
+          Ops[i] = ZeroVal;
+
+        Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops);
+      }
+    }
+  }
+  else
+    Mask = PromoteTargetBoolean(N->getMask(), DataOp.getValueType());
+  return DAG.getMaskedStore(N->getChain(), dl, DataOp, N->getBasePtr(), Mask,
+                            N->getMemoryVT(), N->getMemOperand(),
+                            TruncateStore);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N,
+                                             unsigned OpNo) {
+  assert(OpNo == 2 && "Only know how to promote the mask!");
+  EVT DataVT = N->getValueType(0);
+  SDValue Mask = PromoteTargetBoolean(N->getOperand(OpNo), DataVT);
+  SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
+  NewOps[OpNo] = Mask;
+  return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
 SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) {
   SDValue Op = GetPromotedInteger(N->getOperand(0));
   return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), Op);
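The mask-widening pattern used in PromoteIntOp_MSTORE above (and again in the vector-widening code later in this patch) pads the mask with zeros rather than undef, so the extra lanes are guaranteed to stay inactive. A small sketch of that padding on plain vectors, not the CONCAT_VECTORS construction:

```cpp
#include <vector>

// Pad a mask out to the widened element count; all new lanes are false.
// Assumes WidenNumElts >= Mask.size().
std::vector<bool> widenMaskWithZeros(const std::vector<bool> &Mask,
                                     size_t WidenNumElts) {
  std::vector<bool> Wide(WidenNumElts, false);
  for (size_t i = 0; i != Mask.size(); ++i)
    Wide[i] = Mask[i];
  return Wide;
}
```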
@@ -2936,17 +3017,13 @@
   EVT VT = N->getValueType(0);
   SDLoc dl(N);
 
-  unsigned NumElts = VT.getVectorNumElements();
-  SmallVector<int, 8> NewMask;
-  for (unsigned i = 0; i != NumElts; ++i) {
-    NewMask.push_back(SV->getMaskElt(i));
-  }
+  ArrayRef<int> NewMask = SV->getMask().slice(0, VT.getVectorNumElements());
 
   SDValue V0 = GetPromotedInteger(N->getOperand(0));
   SDValue V1 = GetPromotedInteger(N->getOperand(1));
   EVT OutVT = V0.getValueType();
 
-  return DAG.getVectorShuffle(OutVT, dl, V0, V1, &NewMask[0]);
+  return DAG.getVectorShuffle(OutVT, dl, V0, V1, NewMask);
 }
 
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index bd7dacf..ebf6b28 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -921,6 +921,17 @@
     // The target didn't want to custom lower it after all.
     return false;
 
+  // When called from DAGTypeLegalizer::ExpandIntegerResult, we might need to
+  // provide the same kind of custom splitting behavior.
+  if (Results.size() == N->getNumValues() + 1 && LegalizeResult) {
+    // We've legalized a return type by splitting it. If there is a chain,
+    // replace that too.
+    SetExpandedInteger(SDValue(N, 0), Results[0], Results[1]);
+    if (N->getNumValues() > 1)
+      ReplaceValueWith(SDValue(N, 1), Results[2]);
+    return true;
+  }
+
   // Make everything that once used N's values now use those in Results instead.
   assert(Results.size() == N->getNumValues() &&
          "Custom lowering returned the wrong number of results!");
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 30f412b..cef3fc9 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -240,6 +240,7 @@
   SDValue PromoteIntRes_FP_TO_FP16(SDNode *N);
   SDValue PromoteIntRes_INT_EXTEND(SDNode *N);
   SDValue PromoteIntRes_LOAD(LoadSDNode *N);
+  SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N);
   SDValue PromoteIntRes_Overflow(SDNode *N);
   SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo);
   SDValue PromoteIntRes_SDIV(SDNode *N);
@@ -285,6 +286,8 @@
   SDValue PromoteIntOp_TRUNCATE(SDNode *N);
   SDValue PromoteIntOp_UINT_TO_FP(SDNode *N);
   SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N);
+  SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
+  SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo);
 
   void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
 
@@ -578,6 +581,7 @@
   void SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_LOAD(LoadSDNode *N, SDValue &Lo, SDValue &Hi);
+  void SplitVecRes_MLOAD(MaskedLoadSDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi);
   void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -594,6 +598,7 @@
   SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N);
   SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
   SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
+  SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
   SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N);
   SDValue SplitVecOp_TRUNCATE(SDNode *N);
   SDValue SplitVecOp_VSETCC(SDNode *N);
@@ -627,6 +632,7 @@
   SDValue WidenVecRes_EXTRACT_SUBVECTOR(SDNode* N);
   SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
   SDValue WidenVecRes_LOAD(SDNode* N);
+  SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N);
   SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N);
   SDValue WidenVecRes_SIGN_EXTEND_INREG(SDNode* N);
   SDValue WidenVecRes_SELECT(SDNode* N);
@@ -653,6 +659,7 @@
   SDValue WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
   SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N);
   SDValue WidenVecOp_STORE(SDNode* N);
+  SDValue WidenVecOp_MSTORE(SDNode* N, unsigned OpNo);
   SDValue WidenVecOp_SETCC(SDNode* N);
 
   SDValue WidenVecOp_Convert(SDNode *N);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index b5af7b7..03c2734 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -200,12 +200,15 @@
     LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
     ISD::LoadExtType ExtType = LD->getExtensionType();
     if (LD->getMemoryVT().isVector() && ExtType != ISD::NON_EXTLOAD)
-      switch (TLI.getLoadExtAction(LD->getExtensionType(), LD->getMemoryVT())) {
+      switch (TLI.getLoadExtAction(LD->getExtensionType(), LD->getValueType(0),
+                                   LD->getMemoryVT())) {
       default: llvm_unreachable("This action is not supported yet!");
       case TargetLowering::Legal:
         return TranslateLegalizeResults(Op, Result);
       case TargetLowering::Custom:
         if (SDValue Lowered = TLI.LowerOperation(Result, DAG)) {
+          if (Lowered == Result)
+            return TranslateLegalizeResults(Op, Lowered);
           Changed = true;
           if (Lowered->getNumValues() != Op->getNumValues()) {
             // This expanded to something other than the load. Assume the
@@ -231,9 +234,11 @@
       default: llvm_unreachable("This action is not supported yet!");
       case TargetLowering::Legal:
         return TranslateLegalizeResults(Op, Result);
-      case TargetLowering::Custom:
-        Changed = true;
-        return TranslateLegalizeResults(Op, TLI.LowerOperation(Result, DAG));
+      case TargetLowering::Custom: {
+        SDValue Lowered = TLI.LowerOperation(Result, DAG);
+        Changed = Lowered != Result;
+        return TranslateLegalizeResults(Op, Lowered);
+      }
       case TargetLowering::Expand:
         Changed = true;
         return LegalizeOp(ExpandStore(Op));
@@ -389,7 +394,8 @@
       if (Op.getOperand(j)
               .getValueType()
               .getVectorElementType()
-              .isFloatingPoint())
+              .isFloatingPoint() &&
+          NVT.isVector() && NVT.getVectorElementType().isFloatingPoint())
         Operands[j] = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op.getOperand(j));
       else
         Operands[j] = DAG.getNode(ISD::BITCAST, dl, NVT, Op.getOperand(j));
@@ -398,8 +404,9 @@
   }
 
   Op = DAG.getNode(Op.getOpcode(), dl, NVT, Operands);
-  if (VT.isFloatingPoint() ||
-      (VT.isVector() && VT.getVectorElementType().isFloatingPoint()))
+  if ((VT.isFloatingPoint() && NVT.isFloatingPoint()) ||
+      (VT.isVector() && VT.getVectorElementType().isFloatingPoint() &&
+       NVT.isVector() && NVT.getVectorElementType().isFloatingPoint()))
     return DAG.getNode(ISD::FP_ROUND, dl, VT, Op, DAG.getIntPtrConstant(0));
   else
     return DAG.getNode(ISD::BITCAST, dl, VT, Op);
@@ -509,7 +516,8 @@
         ScalarLoad = DAG.getLoad(WideVT, dl, Chain, BasePTR,
                                  LD->getPointerInfo().getWithOffset(Offset),
                                  LD->isVolatile(), LD->isNonTemporal(),
-                                 LD->isInvariant(), LD->getAlignment(),
+                                 LD->isInvariant(),
+                                 MinAlign(LD->getAlignment(), Offset),
                                  LD->getAAInfo());
       } else {
         EVT LoadVT = WideVT;
@@ -521,7 +529,8 @@
                                     LD->getPointerInfo().getWithOffset(Offset),
                                     LoadVT, LD->isVolatile(),
                                     LD->isNonTemporal(), LD->isInvariant(),
-                                    LD->getAlignment(), LD->getAAInfo());
+                                    MinAlign(LD->getAlignment(), Offset),
+                                    LD->getAAInfo());
       }
 
       RemainingBytes -= LoadBytes;
@@ -553,9 +562,9 @@
       BitOffset += SrcEltBits;
       if (BitOffset >= WideBits) {
         WideIdx++;
-        Offset -= WideBits;
-        if (Offset > 0) {
-          ShAmt = DAG.getConstant(SrcEltBits - Offset,
+        BitOffset -= WideBits;
+        if (BitOffset > 0) {
+          ShAmt = DAG.getConstant(SrcEltBits - BitOffset,
                                   TLI.getShiftAmountTy(WideVT));
           Hi = DAG.getNode(ISD::SHL, dl, WideVT, LoadVals[WideIdx], ShAmt);
           Hi = DAG.getNode(ISD::AND, dl, WideVT, Hi, SrcEltBitMask);
@@ -592,7 +601,7 @@
                 Chain, BasePTR, LD->getPointerInfo().getWithOffset(Idx * Stride),
                 SrcVT.getScalarType(),
                 LD->isVolatile(), LD->isNonTemporal(), LD->isInvariant(),
-                LD->getAlignment(), LD->getAAInfo());
+                MinAlign(LD->getAlignment(), Idx * Stride), LD->getAAInfo());
 
       BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
                          DAG.getConstant(Stride, BasePTR.getValueType()));
@@ -651,7 +660,8 @@
     // This scalar TruncStore may be illegal, but we legalize it later.
     SDValue Store = DAG.getTruncStore(Chain, dl, Ex, BasePTR,
                ST->getPointerInfo().getWithOffset(Idx*Stride), MemSclVT,
-               isVolatile, isNonTemporal, Alignment, AAInfo);
+               isVolatile, isNonTemporal, MinAlign(Alignment, Idx*Stride),
+               AAInfo);
 
     BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
                                DAG.getConstant(Stride, BasePTR.getValueType()));
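The recurring change in this file replaces LD->getAlignment() with MinAlign(LD->getAlignment(), Offset) for the pieces of a split load or store. As a reminder of what that computes, here is a standalone sketch that mirrors llvm::MinAlign from Support/MathExtras.h (to the best of my reading of it):

```cpp
#include <cstdint>

// Largest power of two dividing both A and B; the access at (base + Offset)
// can only be assumed aligned to minAlign(BaseAlign, Offset).
uint64_t minAlign(uint64_t A, uint64_t B) {
  uint64_t Bits = A | B;
  return Bits & (~Bits + 1);  // isolate the lowest set bit
}
// e.g. minAlign(16, 4) == 4: splitting a 16-byte aligned vector at +4 bytes
// only guarantees 4-byte alignment for that piece.
```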
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 27f63d2..63671f7 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -597,6 +597,9 @@
   case ISD::LOAD:
     SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
     break;
+  case ISD::MLOAD:
+    SplitVecRes_MLOAD(cast<MaskedLoadSDNode>(N), Lo, Hi);
+    break;
   case ISD::SETCC:
     SplitVecRes_SETCC(N, Lo, Hi);
     break;
@@ -979,6 +982,67 @@
   ReplaceValueWith(SDValue(LD, 1), Ch);
 }
 
+void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
+                                         SDValue &Lo, SDValue &Hi) {
+  EVT LoVT, HiVT;
+  SDLoc dl(MLD);
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0));
+
+  SDValue Ch = MLD->getChain();
+  SDValue Ptr = MLD->getBasePtr();
+  SDValue Mask = MLD->getMask();
+  unsigned Alignment = MLD->getOriginalAlignment();
+  ISD::LoadExtType ExtType = MLD->getExtensionType();
+
+  // If Alignment is equal to the vector size, use half of it for the
+  // second half of the split access.
+  unsigned SecondHalfAlignment =
+    (Alignment == MLD->getValueType(0).getSizeInBits()/8) ?
+     Alignment/2 : Alignment;
+
+  SDValue MaskLo, MaskHi;
+  std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
+
+  EVT MemoryVT = MLD->getMemoryVT();
+  EVT LoMemVT, HiMemVT;
+  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+  SDValue Src0 = MLD->getSrc0();
+  SDValue Src0Lo, Src0Hi;
+  std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl);
+
+  MachineMemOperand *MMO = DAG.getMachineFunction().
+    getMachineMemOperand(MLD->getPointerInfo(), 
+                         MachineMemOperand::MOLoad,  LoMemVT.getStoreSize(),
+                         Alignment, MLD->getAAInfo(), MLD->getRanges());
+
+  Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, MaskLo, Src0Lo, LoMemVT, MMO,
+                         ExtType);
+
+  unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
+  Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+                    DAG.getConstant(IncrementSize, Ptr.getValueType()));
+
+  MMO = DAG.getMachineFunction().
+    getMachineMemOperand(MLD->getPointerInfo(), 
+                         MachineMemOperand::MOLoad,  HiMemVT.getStoreSize(),
+                         SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges());
+
+  Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, MaskHi, Src0Hi, HiMemVT, MMO,
+                         ExtType);
+
+  // Build a factor node to remember that this load is independent of the
+  // other one.
+  Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+                   Hi.getValue(1));
+
+  // Legalized the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(MLD, 1), Ch);
+
+}
+
 void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) {
   assert(N->getValueType(0).isVector() &&
          N->getOperand(0).getValueType().isVector() &&
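The alignment and offset bookkeeping in SplitVecRes_MLOAD above (and in SplitVecOp_MSTORE further down in this file) reduces to simple integer arithmetic. A minimal sketch with illustrative names, assuming an even two-way split where the low half is half the vector's bytes:

```cpp
#include <cstddef>

struct HalfAccess {
  size_t OffsetBytes;   // where the high half starts relative to the base
  unsigned Alignment;   // alignment assumed for the high half
};

// The high half begins half the vector's bytes past the base pointer; if the
// original alignment equalled the full vector size it drops to half for that
// second access, otherwise it is reused as-is.
HalfAccess highHalfOfSplit(size_t VecBytes, unsigned Alignment) {
  unsigned HiAlign = (Alignment == VecBytes) ? Alignment / 2 : Alignment;
  return {VecBytes / 2, HiAlign};
}
```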
@@ -1234,6 +1298,9 @@
     case ISD::STORE:
       Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo);
       break;
+    case ISD::MSTORE:
+      Res = SplitVecOp_MSTORE(cast<MaskedStoreSDNode>(N), OpNo);
+      break;
     case ISD::VSELECT:
       Res = SplitVecOp_VSELECT(N, OpNo);
       break;
@@ -1395,6 +1462,58 @@
                         MachinePointerInfo(), EltVT, false, false, false, 0);
 }
 
+SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
+                                            unsigned OpNo) {
+  SDValue Ch  = N->getChain();
+  SDValue Ptr = N->getBasePtr();
+  SDValue Mask = N->getMask();
+  SDValue Data = N->getValue();
+  EVT MemoryVT = N->getMemoryVT();
+  unsigned Alignment = N->getOriginalAlignment();
+  SDLoc DL(N);
+  
+  EVT LoMemVT, HiMemVT;
+  std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+  SDValue DataLo, DataHi;
+  GetSplitVector(Data, DataLo, DataHi);
+  SDValue MaskLo, MaskHi;
+  GetSplitVector(Mask, MaskLo, MaskHi);
+
+  // If Alignment is equal to the vector size, use half of it for the
+  // second half of the split access.
+  unsigned SecondHalfAlignment =
+    (Alignment == Data->getValueType(0).getSizeInBits()/8) ?
+       Alignment/2 : Alignment;
+
+  SDValue Lo, Hi;
+  MachineMemOperand *MMO = DAG.getMachineFunction().
+    getMachineMemOperand(N->getPointerInfo(), 
+                         MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
+                         Alignment, N->getAAInfo(), N->getRanges());
+
+  Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO,
+                          N->isTruncatingStore());
+
+  unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
+  Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+                    DAG.getConstant(IncrementSize, Ptr.getValueType()));
+
+  MMO = DAG.getMachineFunction().
+    getMachineMemOperand(N->getPointerInfo(), 
+                         MachineMemOperand::MOStore,  HiMemVT.getStoreSize(),
+                         SecondHalfAlignment, N->getAAInfo(), N->getRanges());
+
+  Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO,
+                          N->isTruncatingStore());
+
+  // Build a factor node to remember that this store is independent of the
+  // other one.
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
+
+}
+
 SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
   assert(N->isUnindexed() && "Indexed store of vector?");
   assert(OpNo == 1 && "Can only split the stored value");
@@ -1599,6 +1718,9 @@
   case ISD::VECTOR_SHUFFLE:
     Res = WidenVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N));
     break;
+  case ISD::MLOAD:
+    Res = WidenVecRes_MLOAD(cast<MaskedLoadSDNode>(N));
+    break;
 
   case ISD::ADD:
   case ISD::AND:
@@ -2289,6 +2411,44 @@
   return Result;
 }
 
+SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) {
+  
+  EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),N->getValueType(0));
+  SDValue Mask = N->getMask();
+  EVT MaskVT = Mask.getValueType();
+  SDValue Src0 = GetWidenedVector(N->getSrc0());
+  ISD::LoadExtType ExtType = N->getExtensionType();
+  SDLoc dl(N);
+
+  if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
+    Mask = GetWidenedVector(Mask);
+  else {
+    EVT BoolVT = getSetCCResultType(WidenVT);
+
+    // We can't use ModifyToType() because the extra mask lanes must be
+    // filled with zeroes.
+    unsigned WidenNumElts = BoolVT.getVectorNumElements();
+    unsigned MaskNumElts = MaskVT.getVectorNumElements();
+
+    unsigned NumConcat = WidenNumElts / MaskNumElts;
+    SmallVector<SDValue, 16> Ops(NumConcat);
+    SDValue ZeroVal = DAG.getConstant(0, MaskVT);
+    Ops[0] = Mask;
+    for (unsigned i = 1; i != NumConcat; ++i)
+      Ops[i] = ZeroVal;
+
+    Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops);
+  }
+
+  SDValue Res = DAG.getMaskedLoad(WidenVT, dl, N->getChain(), N->getBasePtr(),
+                                  Mask, Src0, N->getMemoryVT(),
+                                  N->getMemOperand(), ExtType);
+  // Legalized the chain result - switch anything that used the old chain to
+  // use the new one.
+  ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+  return Res;
+}
+
 SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) {
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N),
@@ -2434,6 +2594,7 @@
   case ISD::EXTRACT_SUBVECTOR:  Res = WidenVecOp_EXTRACT_SUBVECTOR(N); break;
   case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break;
   case ISD::STORE:              Res = WidenVecOp_STORE(N); break;
+  case ISD::MSTORE:             Res = WidenVecOp_MSTORE(N, OpNo); break;
   case ISD::SETCC:              Res = WidenVecOp_SETCC(N); break;
 
   case ISD::ANY_EXTEND:
@@ -2632,6 +2793,42 @@
     return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain);
 }
 
+SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) {
+  MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
+  SDValue Mask = MST->getMask();
+  EVT MaskVT = Mask.getValueType();
+  SDValue StVal = MST->getValue();
+  // Widen the value
+  SDValue WideVal = GetWidenedVector(StVal);
+  SDLoc dl(N);
+
+  if (OpNo == 2 || getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
+    Mask = GetWidenedVector(Mask);
+  else {
+    // The mask should be widened as well
+    EVT BoolVT = getSetCCResultType(WideVal.getValueType());
+    // We can't use ModifyToType() because the extra mask lanes must be
+    // filled with zeroes.
+    unsigned WidenNumElts = BoolVT.getVectorNumElements();
+    unsigned MaskNumElts = MaskVT.getVectorNumElements();
+
+    unsigned NumConcat = WidenNumElts / MaskNumElts;
+    SmallVector<SDValue, 16> Ops(NumConcat);
+    SDValue ZeroVal = DAG.getConstant(0, MaskVT);
+    Ops[0] = Mask;
+    for (unsigned i = 1; i != NumConcat; ++i)
+      Ops[i] = ZeroVal;
+
+    Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops);
+  }
+  assert(Mask.getValueType().getVectorNumElements() ==
+         WideVal.getValueType().getVectorNumElements() &&
+         "Mask and data vectors should have the same number of elements");
+  return DAG.getMaskedStore(MST->getChain(), dl, WideVal, MST->getBasePtr(),
+                            Mask, MST->getMemoryVT(), MST->getMemOperand(),
+                            false);
+}
+
 SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
   SDValue InOp0 = GetWidenedVector(N->getOperand(0));
   SDValue InOp1 = GetWidenedVector(N->getOperand(1));
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 8b9f618..3853ada 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -137,13 +137,9 @@
 }
 
 // Helper for AddGlue to clone node operands.
-static void CloneNodeWithValues(SDNode *N, SelectionDAG *DAG,
-                                SmallVectorImpl<EVT> &VTs,
+static void CloneNodeWithValues(SDNode *N, SelectionDAG *DAG, ArrayRef<EVT> VTs,
                                 SDValue ExtraOper = SDValue()) {
-  SmallVector<SDValue, 8> Ops;
-  for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I)
-    Ops.push_back(N->getOperand(I));
-
+  SmallVector<SDValue, 8> Ops(N->op_begin(), N->op_end());
   if (ExtraOper.getNode())
     Ops.push_back(ExtraOper);
 
@@ -165,7 +161,6 @@
 }
 
 static bool AddGlue(SDNode *N, SDValue Glue, bool AddGlue, SelectionDAG *DAG) {
-  SmallVector<EVT, 4> VTs;
   SDNode *GlueDestNode = Glue.getNode();
 
   // Don't add glue from a node to itself.
@@ -179,9 +174,7 @@
   // Don't add glue to something that already has a glue value.
   if (N->getValueType(N->getNumValues() - 1) == MVT::Glue) return false;
 
-  for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
-    VTs.push_back(N->getValueType(I));
-
+  SmallVector<EVT, 4> VTs(N->value_begin(), N->value_end());
   if (AddGlue)
     VTs.push_back(MVT::Glue);
 
@@ -197,11 +190,8 @@
           !N->hasAnyUseOfValue(N->getNumValues() - 1)) &&
          "expected an unused glue value");
 
-  SmallVector<EVT, 4> VTs;
-  for (unsigned I = 0, E = N->getNumValues()-1; I != E; ++I)
-    VTs.push_back(N->getValueType(I));
-
-  CloneNodeWithValues(N, DAG, VTs);
+  CloneNodeWithValues(N, DAG,
+                      makeArrayRef(N->value_begin(), N->getNumValues() - 1));
 }
 
 /// ClusterNeighboringLoads - Force nearby loads together by "gluing" them.
@@ -551,6 +541,14 @@
     NodeNumDefs = 0;
     return;
   }
+  if (POpc == TargetOpcode::PATCHPOINT &&
+      Node->getValueType(0) == MVT::Other) {
+    // PATCHPOINT is defined to have one result, but it might really have none
+    // if we're not using CallingConv::AnyReg. Don't mistake the chain for a
+    // real definition.
+    NodeNumDefs = 0;
+    return;
+  }
   unsigned NRegDefs = SchedDAG->TII->get(Node->getMachineOpcode()).getNumDefs();
   // Some instructions define regs that are not represented in the selection DAG
   // (e.g. unused flags). See tMOVi8. Make sure we don't access past NumValues.
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 7961e66..9466f4d 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -234,10 +234,10 @@
   return true;
 }
 
-ISD::NodeType ISD::getExtForLoadExtType(ISD::LoadExtType ExtType) {
+ISD::NodeType ISD::getExtForLoadExtType(bool IsFP, ISD::LoadExtType ExtType) {
   switch (ExtType) {
   case ISD::EXTLOAD:
-    return ISD::ANY_EXTEND;
+    return IsFP ? ISD::FP_EXTEND : ISD::ANY_EXTEND;
   case ISD::SEXTLOAD:
     return ISD::SIGN_EXTEND;
   case ISD::ZEXTLOAD:
@@ -1484,6 +1484,34 @@
   if (N1.getOpcode() == ISD::UNDEF)
     commuteShuffle(N1, N2, MaskVec);
 
+  // If shuffling a splat, try to blend the splat instead. We do this here so
+  // that even when this arises during lowering we don't have to re-handle it.
+  auto BlendSplat = [&](BuildVectorSDNode *BV, int Offset) {
+    BitVector UndefElements;
+    SDValue Splat = BV->getSplatValue(&UndefElements);
+    if (!Splat)
+      return;
+
+    for (int i = 0; i < (int)NElts; ++i) {
+      if (MaskVec[i] < Offset || MaskVec[i] >= (Offset + (int)NElts))
+        continue;
+
+      // If this input comes from undef, mark it as such.
+      if (UndefElements[MaskVec[i] - Offset]) {
+        MaskVec[i] = -1;
+        continue;
+      }
+
+      // If we can blend a non-undef lane, use that instead.
+      if (!UndefElements[i])
+        MaskVec[i] = i + Offset;
+    }
+  };
+  if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
+    BlendSplat(N1BV, 0);
+  if (auto *N2BV = dyn_cast<BuildVectorSDNode>(N2))
+    BlendSplat(N2BV, NElts);
+
   // Canonicalize all index into lhs, -> shuffle lhs, undef
   // Canonicalize all index into rhs, -> shuffle rhs, undef
   bool AllLHS = true, AllRHS = true;
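The BlendSplat lambda above rewrites the shuffle mask in place so that lanes reading from a splatted input prefer the in-place lane, turning the shuffle into a blend. A self-contained sketch of the same rewrite on plain vectors (illustrative types, not SelectionDAG):

```cpp
#include <vector>

// For an input that is a splat, any in-range mask entry may be redirected to
// the in-place lane (i + Offset); entries that would read an undef splat
// element become undef (-1).
void blendSplatMask(std::vector<int> &Mask,
                    const std::vector<bool> &UndefElements, int Offset) {
  const int NElts = static_cast<int>(Mask.size());
  for (int i = 0; i < NElts; ++i) {
    if (Mask[i] < Offset || Mask[i] >= Offset + NElts)
      continue;                          // lane comes from the other operand
    if (UndefElements[Mask[i] - Offset])
      Mask[i] = -1;                      // splat element read is undef
    else if (!UndefElements[i])
      Mask[i] = i + Offset;              // prefer the in-place lane
  }
}
```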
@@ -1513,9 +1541,10 @@
     return getUNDEF(VT);
 
   // If Identity shuffle return that node.
-  bool Identity = true;
+  bool Identity = true, AllSame = true;
   for (unsigned i = 0; i != NElts; ++i) {
     if (MaskVec[i] >= 0 && MaskVec[i] != (int)i) Identity = false;
+    if (MaskVec[i] != MaskVec[0]) AllSame = false;
   }
   if (Identity && NElts)
     return N1;
@@ -1537,18 +1566,35 @@
       if (Splat && Splat.getOpcode() == ISD::UNDEF)
         return getUNDEF(VT);
 
+      bool SameNumElts =
+          V.getValueType().getVectorNumElements() == VT.getVectorNumElements();
+
       // We only have a splat which can skip shuffles if there is a splatted
       // value and no undef lanes rearranged by the shuffle.
       if (Splat && UndefElements.none()) {
         // Splat of <x, x, ..., x>, return <x, x, ..., x>, provided that the
         // number of elements match or the value splatted is a zero constant.
-        if (V.getValueType().getVectorNumElements() ==
-            VT.getVectorNumElements())
+        if (SameNumElts)
           return N1;
         if (auto *C = dyn_cast<ConstantSDNode>(Splat))
           if (C->isNullValue())
             return N1;
       }
+
+      // If the shuffle itself creates a splat, build the vector directly.
+      if (AllSame && SameNumElts) {
+        const SDValue &Splatted = BV->getOperand(MaskVec[0]);
+        SmallVector<SDValue, 8> Ops(NElts, Splatted);
+
+        EVT BuildVT = BV->getValueType(0);
+        SDValue NewBV = getNode(ISD::BUILD_VECTOR, dl, BuildVT, Ops);
+
+        // We may have jumped through bitcasts, so the type of the
+        // BUILD_VECTOR may not match the type of the shuffle.
+        if (BuildVT != VT)
+          NewBV = getNode(ISD::BITCAST, dl, VT, NewBV);
+        return NewBV;
+      }
     }
   }
 
@@ -2323,6 +2369,21 @@
     KnownZero = APInt::getHighBitsSet(BitWidth, Leaders);
     break;
   }
+  case ISD::EXTRACT_ELEMENT: {
+    computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+    const unsigned Index =
+      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    const unsigned BitWidth = Op.getValueType().getSizeInBits();
+
+    // Remove low part of known bits mask
+    KnownZero = KnownZero.getHiBits(KnownZero.getBitWidth() - Index * BitWidth);
+    KnownOne = KnownOne.getHiBits(KnownOne.getBitWidth() - Index * BitWidth);
+
+    // Remove high part of known bit mask
+    KnownZero = KnownZero.trunc(BitWidth);
+    KnownOne = KnownOne.trunc(BitWidth);
+    break;
+  }
   case ISD::FrameIndex:
   case ISD::TargetFrameIndex:
     if (unsigned Align = InferPtrAlignment(Op)) {
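The new EXTRACT_ELEMENT known-bits case works because extracting element Index of width BitWidth from a wider integer is a right shift by Index * BitWidth followed by a truncate, so the element's known bits are the matching slice of the wide value's known bits. A scalar model of that equivalence for the common i64 -> 2 x i32 case:

```cpp
#include <cstdint>

// Element 0 is the low 32 bits, element 1 the high 32 bits (little-end
// indexing, as EXTRACT_ELEMENT uses). Index must be 0 or 1 here.
uint32_t extractElementOfI64(uint64_t Wide, unsigned Index) {
  return static_cast<uint32_t>(Wide >> (Index * 32));
}
```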
@@ -2522,6 +2583,21 @@
     // FIXME: it's tricky to do anything useful for this, but it is an important
     // case for targets like X86.
     break;
+  case ISD::EXTRACT_ELEMENT: {
+    const int KnownSign = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+    const int BitWidth = Op.getValueType().getSizeInBits();
+    const int Items =
+      Op.getOperand(0).getValueType().getSizeInBits() / BitWidth;
+
+    // Op1 indexes elements from the little end, but the sign bits live at
+    // the big end, so convert to a reverse index counted from the big end.
+    const int rIndex = Items - 1 -
+      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+
+    // If the sign portion ends in our element, the subtraction gives the
+    // correct result. Otherwise it is either negative or larger than the
+    // bit width, and the clamp below handles both cases.
+    return std::max(std::min(KnownSign - rIndex * BitWidth, BitWidth), 0);
+  }
   }
 
   // If we are looking at the loaded value of the SDNode.
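A worked model of the EXTRACT_ELEMENT sign-bit rule added above, using the same clamp. For example, an i64 known to have 40 sign bits yields 32 sign bits for its high i32 element and 8 for its low one. Illustrative helper, not LLVM API:

```cpp
#include <algorithm>

// KnownSign is the sign-bit count of the wide value; Op1 indexes elements
// from the little end; rIndex is the distance from the big (sign) end.
int signBitsOfElement(int KnownSign, int Op1, int Items, int BitWidth) {
  const int rIndex = Items - 1 - Op1;
  return std::max(std::min(KnownSign - rIndex * BitWidth, BitWidth), 0);
}
// signBitsOfElement(40, 1, 2, 32) == 32; signBitsOfElement(40, 0, 2, 32) == 8.
```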
@@ -2683,6 +2759,8 @@
       return getConstantFP(apf, VT);
     }
     case ISD::BITCAST:
+      if (VT == MVT::f16 && C->getValueType(0) == MVT::i16)
+        return getConstantFP(APFloat(APFloat::IEEEhalf, Val), VT);
       if (VT == MVT::f32 && C->getValueType(0) == MVT::i32)
         return getConstantFP(APFloat(APFloat::IEEEsingle, Val), VT);
       else if (VT == MVT::f64 && C->getValueType(0) == MVT::i64)
@@ -2756,7 +2834,9 @@
       return getConstant(api, VT);
     }
     case ISD::BITCAST:
-      if (VT == MVT::i32 && C->getValueType(0) == MVT::f32)
+      if (VT == MVT::i16 && C->getValueType(0) == MVT::f16)
+        return getConstant((uint16_t)V.bitcastToAPInt().getZExtValue(), VT);
+      else if (VT == MVT::i32 && C->getValueType(0) == MVT::f32)
         return getConstant((uint32_t)V.bitcastToAPInt().getZExtValue(), VT);
       else if (VT == MVT::i64 && C->getValueType(0) == MVT::f64)
         return getConstant(V.bitcastToAPInt().getZExtValue(), VT);
@@ -3379,8 +3459,9 @@
   }
 
   // Perform trivial constant folding.
-  SDValue SV = FoldConstantArithmetic(Opcode, VT, N1.getNode(), N2.getNode());
-  if (SV.getNode()) return SV;
+  if (SDValue SV =
+          FoldConstantArithmetic(Opcode, VT, N1.getNode(), N2.getNode()))
+    return SV;
 
   // Canonicalize constant to RHS if commutative.
   if (N1C && !N2C && isCommutativeBinOp(Opcode)) {
@@ -3564,7 +3645,7 @@
       const APFloat &V3 = N3CFP->getValueAPF();
       APFloat::opStatus s =
         V1.fusedMultiplyAdd(V2, V3, APFloat::rmNearestTiesToEven);
-      if (s != APFloat::opInvalidOp)
+      if (!TLI->hasFloatingPointExceptions() || s != APFloat::opInvalidOp)
         return getConstantFP(V1, VT);
     }
     break;
@@ -3913,9 +3994,7 @@
   bool DstAlignCanChange = false;
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
-  bool OptSize =
-    MF.getFunction()->getAttributes().
-      hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+  bool OptSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
   if (FI && !MFI->isFixedObjectIndex(FI->getIndex()))
     DstAlignCanChange = true;
@@ -4028,8 +4107,7 @@
   bool DstAlignCanChange = false;
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
-  bool OptSize = MF.getFunction()->getAttributes().
-    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+  bool OptSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
   if (FI && !MFI->isFixedObjectIndex(FI->getIndex()))
     DstAlignCanChange = true;
@@ -4123,8 +4201,7 @@
   bool DstAlignCanChange = false;
   MachineFunction &MF = DAG.getMachineFunction();
   MachineFrameInfo *MFI = MF.getFrameInfo();
-  bool OptSize = MF.getFunction()->getAttributes().
-    hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
+  bool OptSize = MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize);
   FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
   if (FI && !MFI->isFixedObjectIndex(FI->getIndex()))
     DstAlignCanChange = true;
@@ -4214,11 +4291,13 @@
 
   // Then check to see if we should lower the memcpy with target-specific
   // code. If the target chooses to do this, this is the next best.
-  SDValue Result =
-      TSI->EmitTargetCodeForMemcpy(*this, dl, Chain, Dst, Src, Size, Align,
-                                   isVol, AlwaysInline, DstPtrInfo, SrcPtrInfo);
-  if (Result.getNode())
-    return Result;
+  if (TSI) {
+    SDValue Result = TSI->EmitTargetCodeForMemcpy(
+        *this, dl, Chain, Dst, Src, Size, Align, isVol, AlwaysInline,
+        DstPtrInfo, SrcPtrInfo);
+    if (Result.getNode())
+      return Result;
+  }
 
   // If we really need inline code and the target declined to provide it,
   // use a (potentially long) sequence of loads and stores.
@@ -4280,10 +4359,12 @@
 
   // Then check to see if we should lower the memmove with target-specific
   // code. If the target chooses to do this, this is the next best.
-  SDValue Result = TSI->EmitTargetCodeForMemmove(
-      *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo, SrcPtrInfo);
-  if (Result.getNode())
-    return Result;
+  if (TSI) {
+    SDValue Result = TSI->EmitTargetCodeForMemmove(
+        *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo, SrcPtrInfo);
+    if (Result.getNode())
+      return Result;
+  }
 
   // FIXME: If the memmove is volatile, lowering it to plain libc memmove may
   // not be safe.  See memcpy above for more details.
@@ -4332,10 +4413,12 @@
 
   // Then check to see if we should lower the memset with target-specific
   // code. If the target chooses to do this, this is the next best.
-  SDValue Result = TSI->EmitTargetCodeForMemset(*this, dl, Chain, Dst, Src,
-                                                Size, Align, isVol, DstPtrInfo);
-  if (Result.getNode())
-    return Result;
+  if (TSI) {
+    SDValue Result = TSI->EmitTargetCodeForMemset(
+        *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo);
+    if (Result.getNode())
+      return Result;
+  }
 
   // Emit a library call.
   Type *IntPtrTy = TLI->getDataLayout()->getIntPtrType(*getContext());
@@ -4680,10 +4763,10 @@
     assert(VT.isInteger() == MemVT.isInteger() &&
            "Cannot convert from FP to Int or Int -> FP!");
     assert(VT.isVector() == MemVT.isVector() &&
-           "Cannot use trunc store to convert to or from a vector!");
+           "Cannot use an ext load to convert to or from a vector!");
     assert((!VT.isVector() ||
             VT.getVectorNumElements() == MemVT.getVectorNumElements()) &&
-           "Cannot use trunc store to change the number of vector elements!");
+           "Cannot use an ext load to change the number of vector elements!");
   }
 
   bool Indexed = AM != ISD::UNINDEXED;
@@ -4917,6 +5000,61 @@
   return SDValue(N, 0);
 }
 
+SDValue
+SelectionDAG::getMaskedLoad(EVT VT, SDLoc dl, SDValue Chain,
+                            SDValue Ptr, SDValue Mask, SDValue Src0, EVT MemVT,
+                            MachineMemOperand *MMO, ISD::LoadExtType ExtTy) {
+
+  SDVTList VTs = getVTList(VT, MVT::Other);
+  SDValue Ops[] = { Chain, Ptr, Mask, Src0 };
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::MLOAD, VTs, Ops);
+  ID.AddInteger(VT.getRawBits());
+  ID.AddInteger(encodeMemSDNodeFlags(ExtTy, ISD::UNINDEXED,
+                                     MMO->isVolatile(),
+                                     MMO->isNonTemporal(),
+                                     MMO->isInvariant()));
+  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+  void *IP = nullptr;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    cast<MaskedLoadSDNode>(E)->refineAlignment(MMO);
+    return SDValue(E, 0);
+  }
+  SDNode *N = new (NodeAllocator) MaskedLoadSDNode(dl.getIROrder(),
+                                             dl.getDebugLoc(), Ops, 4, VTs,
+                                             ExtTy, MemVT, MMO);
+  CSEMap.InsertNode(N, IP);
+  InsertNode(N);
+  return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getMaskedStore(SDValue Chain, SDLoc dl, SDValue Val,
+                                     SDValue Ptr, SDValue Mask, EVT MemVT,
+                                     MachineMemOperand *MMO, bool isTrunc) {
+  assert(Chain.getValueType() == MVT::Other &&
+        "Invalid chain type");
+  EVT VT = Val.getValueType();
+  SDVTList VTs = getVTList(MVT::Other);
+  SDValue Ops[] = { Chain, Ptr, Mask, Val };
+  FoldingSetNodeID ID;
+  AddNodeIDNode(ID, ISD::MSTORE, VTs, Ops);
+  ID.AddInteger(VT.getRawBits());
+  ID.AddInteger(encodeMemSDNodeFlags(false, ISD::UNINDEXED, MMO->isVolatile(),
+                                     MMO->isNonTemporal(), MMO->isInvariant()));
+  ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+  void *IP = nullptr;
+  if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) {
+    cast<MaskedStoreSDNode>(E)->refineAlignment(MMO);
+    return SDValue(E, 0);
+  }
+  SDNode *N = new (NodeAllocator) MaskedStoreSDNode(dl.getIROrder(),
+                                                    dl.getDebugLoc(), Ops, 4,
+                                                    VTs, isTrunc, MemVT, MMO);
+  CSEMap.InsertNode(N, IP);
+  InsertNode(N);
+  return SDValue(N, 0);
+}
+
 SDValue SelectionDAG::getVAArg(EVT VT, SDLoc dl,
                                SDValue Chain, SDValue Ptr,
                                SDValue SV,
@@ -6495,11 +6633,25 @@
     return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes);
   }
 
-  // Handle X+C
-  if (isBaseWithConstantOffset(Loc) && Loc.getOperand(0) == BaseLoc &&
-      cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue() == Dist*Bytes)
-    return true;
-
+  // Handle X + C.
+  if (isBaseWithConstantOffset(Loc)) {
+    int64_t LocOffset = cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue();
+    if (Loc.getOperand(0) == BaseLoc) {
+      // If the base location is a simple address with no offset itself, then
+      // the second load's first add operand should be the base address.
+      if (LocOffset == Dist * (int)Bytes)
+        return true;
+    } else if (isBaseWithConstantOffset(BaseLoc)) {
+      // The base location itself has an offset, so subtract that value from the
+      // second load's offset before comparing to distance * size.
+      int64_t BOffset =
+        cast<ConstantSDNode>(BaseLoc.getOperand(1))->getSExtValue();
+      if (Loc.getOperand(0) == BaseLoc.getOperand(0)) {
+        if ((LocOffset - BOffset) == Dist * (int)Bytes)
+          return true;
+      }
+    }
+  }
   const GlobalValue *GV1 = nullptr;
   const GlobalValue *GV2 = nullptr;
   int64_t Offset1 = 0;
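The extended "X + C" handling above reduces to comparing constant offsets relative to a common base, whether or not the base location itself carries an offset. A one-function scalar sketch of the check, with illustrative names:

```cpp
#include <cstdint>

// Loc = Base + LocOffset and BaseLoc = Base + BaseOffset are Dist elements of
// Bytes bytes apart exactly when the offset difference equals Dist * Bytes.
bool areConsecutiveOffsets(int64_t LocOffset, int64_t BaseOffset, int Dist,
                           unsigned Bytes) {
  return LocOffset - BaseOffset == static_cast<int64_t>(Dist) * Bytes;
}
```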
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 8f582f1..097b618 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -16,9 +16,11 @@
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/FastISel.h"
@@ -46,6 +48,8 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -54,7 +58,6 @@
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetIntrinsicInfo.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetSelectionDAGInfo.h"
@@ -564,6 +567,7 @@
   } else if (NumParts > 0) {
     // If the intermediate type was expanded, split each the value into
     // legal parts.
+    assert(NumIntermediates != 0 && "division by zero");
     assert(NumParts % NumIntermediates == 0 &&
            "Must expand into a divisible number of parts!");
     unsigned Factor = NumParts / NumIntermediates;
@@ -865,7 +869,7 @@
   AA = &aa;
   GFI = gfi;
   LibInfo = li;
-  DL = DAG.getSubtarget().getDataLayout();
+  DL = DAG.getTarget().getDataLayout();
   Context = DAG.getContext();
   LPadToCallSiteMap.clear();
 }
@@ -884,6 +888,7 @@
   CurInst = nullptr;
   HasTailCall = false;
   SDNodeOrder = LowestSDNodeOrder;
+  StatepointLowering.clear();
 }
 
 /// clearDanglingDebugInfo - Clear the dangling debug information
@@ -1234,24 +1239,29 @@
     unsigned NumValues = ValueVTs.size();
     if (NumValues) {
       SDValue RetOp = getValue(I.getOperand(0));
-      for (unsigned j = 0, f = NumValues; j != f; ++j) {
+
+      const Function *F = I.getParent()->getParent();
+
+      ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
+      if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+                                          Attribute::SExt))
+        ExtendKind = ISD::SIGN_EXTEND;
+      else if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+                                               Attribute::ZExt))
+        ExtendKind = ISD::ZERO_EXTEND;
+
+      LLVMContext &Context = F->getContext();
+      bool RetInReg = F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+                                                      Attribute::InReg);
+
+      for (unsigned j = 0; j != NumValues; ++j) {
         EVT VT = ValueVTs[j];
 
-        ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
-
-        const Function *F = I.getParent()->getParent();
-        if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
-                                            Attribute::SExt))
-          ExtendKind = ISD::SIGN_EXTEND;
-        else if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
-                                                 Attribute::ZExt))
-          ExtendKind = ISD::ZERO_EXTEND;
-
         if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger())
-          VT = TLI.getTypeForExtArgOrReturn(*DAG.getContext(), VT, ExtendKind);
+          VT = TLI.getTypeForExtArgOrReturn(Context, VT, ExtendKind);
 
-        unsigned NumParts = TLI.getNumRegisters(*DAG.getContext(), VT);
-        MVT PartVT = TLI.getRegisterType(*DAG.getContext(), VT);
+        unsigned NumParts = TLI.getNumRegisters(Context, VT);
+        MVT PartVT = TLI.getRegisterType(Context, VT);
         SmallVector<SDValue, 4> Parts(NumParts);
         getCopyToParts(DAG, getCurSDLoc(),
                        SDValue(RetOp.getNode(), RetOp.getResNo() + j),
@@ -1259,8 +1269,7 @@
 
         // 'inreg' on function refers to return value
         ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
-        if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
-                                            Attribute::InReg))
+        if (RetInReg)
           Flags.setInReg();
 
         // Propagate extension type if any
@@ -1405,7 +1414,7 @@
         if (TM.Options.NoNaNsFPMath)
           Condition = getFCmpCodeWithoutNaN(Condition);
       } else {
-        Condition = ISD::SETEQ; // silence warning.
+        (void)Condition; // silence warning.
         llvm_unreachable("Unknown compare instruction");
       }
 
@@ -1947,7 +1956,7 @@
   SDValue ShiftOp = DAG.getCopyFromReg(getControlRoot(), getCurSDLoc(),
                                        Reg, VT);
   SDValue Cmp;
-  unsigned PopCount = CountPopulation_64(B.Mask);
+  unsigned PopCount = countPopulation(B.Mask);
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (PopCount == 1) {
     // Testing for a single bit; just compare the shift count with what it
@@ -1959,7 +1968,7 @@
     // There is only one zero bit in the range, test for it directly.
     Cmp = DAG.getSetCC(
         getCurSDLoc(), TLI.getSetCCResultType(*DAG.getContext(), VT), ShiftOp,
-        DAG.getConstant(CountTrailingOnes_64(B.Mask), VT), ISD::SETNE);
+        DAG.getConstant(countTrailingOnes(B.Mask), VT), ISD::SETNE);
   } else {
     // Make desired shift
     SDValue SwitchVal = DAG.getNode(ISD::SHL, getCurSDLoc(), VT,
@@ -2062,10 +2071,14 @@
   // Get the two live-in registers as SDValues. The physregs have already been
   // copied into virtual registers.
   SDValue Ops[2];
-  Ops[0] = DAG.getZExtOrTrunc(
-      DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(),
-                         FuncInfo.ExceptionPointerVirtReg, TLI.getPointerTy()),
-      getCurSDLoc(), ValueVTs[0]);
+  if (FuncInfo.ExceptionPointerVirtReg) {
+    Ops[0] = DAG.getZExtOrTrunc(
+        DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(),
+                           FuncInfo.ExceptionPointerVirtReg, TLI.getPointerTy()),
+        getCurSDLoc(), ValueVTs[0]);
+  } else {
+    Ops[0] = DAG.getConstant(0, TLI.getPointerTy());
+  }
   Ops[1] = DAG.getZExtOrTrunc(
       DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(),
                          FuncInfo.ExceptionSelectorVirtReg, TLI.getPointerTy()),
@@ -2077,6 +2090,27 @@
   setValue(&LP, Res);
 }
 
+unsigned
+SelectionDAGBuilder::visitLandingPadClauseBB(GlobalValue *ClauseGV,
+                                             MachineBasicBlock *LPadBB) {
+  SDValue Chain = getControlRoot();
+
+  // Get the typeid that we will dispatch on later.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy());
+  unsigned VReg = FuncInfo.MF->getRegInfo().createVirtualRegister(RC);
+  unsigned TypeID = DAG.getMachineFunction().getMMI().getTypeIDFor(ClauseGV);
+  SDValue Sel = DAG.getConstant(TypeID, TLI.getPointerTy());
+  Chain = DAG.getCopyToReg(Chain, getCurSDLoc(), VReg, Sel);
+
+  // Branch to the main landing pad block.
+  MachineBasicBlock *ClauseMBB = FuncInfo.MBB;
+  ClauseMBB->addSuccessor(LPadBB);
+  DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other, Chain,
+                          DAG.getBasicBlock(LPadBB)));
+  return VReg;
+}
+
 /// handleSmallSwitchCaseRange - Emit a series of specific tests (suitable for
 /// small case ranges).
 bool SelectionDAGBuilder::handleSmallSwitchRange(CaseRec& CR,
@@ -2363,17 +2397,8 @@
                                                   CaseRecVector& WorkList,
                                                   const Value* SV,
                                                   MachineBasicBlock* SwitchBB) {
-  // Get the MachineFunction which holds the current MBB.  This is used when
-  // inserting any additional MBBs necessary to represent the switch.
-  MachineFunction *CurMF = FuncInfo.MF;
-
-  // Figure out which block is immediately after the current one.
-  MachineFunction::iterator BBI = CR.CaseBB;
-  ++BBI;
-
   Case& FrontCase = *CR.Range.first;
   Case& BackCase  = *(CR.Range.second-1);
-  const BasicBlock *LLVMBB = CR.CaseBB->getBasicBlock();
 
   // Size is the number of Cases represented by this range.
   unsigned Size = CR.Range.second - CR.Range.first;
@@ -2395,6 +2420,7 @@
   DEBUG(dbgs() << "Selecting best pivot: \n"
                << "First: " << First << ", Last: " << Last <<'\n'
                << "LSize: " << LSize << ", RSize: " << RSize << '\n');
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   for (CaseItr I = CR.Range.first, J=I+1, E = CR.Range.second;
        J!=E; ++I, ++J) {
     const APInt &LEnd = cast<ConstantInt>(I->High)->getValue();
@@ -2404,13 +2430,17 @@
            "Invalid case distance");
     // Use volatile double here to avoid excess precision issues on some hosts,
     // e.g. that use 80-bit X87 registers.
+    // Only consider the density of sub-ranges that actually have sufficient
+    // entries to be lowered as a jump table.
     volatile double LDensity =
-       (double)LSize.roundToDouble() /
-                           (LEnd - First + 1ULL).roundToDouble();
+        LSize.ult(TLI.getMinimumJumpTableEntries())
+            ? 0.0
+            : LSize.roundToDouble() / (LEnd - First + 1ULL).roundToDouble();
     volatile double RDensity =
-      (double)RSize.roundToDouble() /
-                           (Last - RBegin + 1ULL).roundToDouble();
-    volatile double Metric = Range.logBase2()*(LDensity+RDensity);
+        RSize.ult(TLI.getMinimumJumpTableEntries())
+            ? 0.0
+            : RSize.roundToDouble() / (Last - RBegin + 1ULL).roundToDouble();
+    volatile double Metric = Range.logBase2() * (LDensity + RDensity);
     // Should always split in some non-trivial place
     DEBUG(dbgs() <<"=>Step\n"
                  << "LEnd: " << LEnd << ", RBegin: " << RBegin << '\n'
@@ -2427,13 +2457,25 @@
     RSize -= J->size();
   }
 
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (areJTsAllowed(TLI)) {
-    // If our case is dense we *really* should handle it earlier!
-    assert((FMetric > 0) && "Should handle dense range earlier!");
-  } else {
+  if (FMetric == 0 || !areJTsAllowed(TLI))
     Pivot = CR.Range.first + Size/2;
-  }
+  splitSwitchCase(CR, Pivot, WorkList, SV, SwitchBB);
+  return true;
+}
+
+void SelectionDAGBuilder::splitSwitchCase(CaseRec &CR, CaseItr Pivot,
+                                          CaseRecVector &WorkList,
+                                          const Value *SV,
+                                          MachineBasicBlock *SwitchBB) {
+  // Get the MachineFunction which holds the current MBB.  This is used when
+  // inserting any additional MBBs necessary to represent the switch.
+  MachineFunction *CurMF = FuncInfo.MF;
+
+  // Figure out which block is immediately after the current one.
+  MachineFunction::iterator BBI = CR.CaseBB;
+  ++BBI;
+
+  const BasicBlock *LLVMBB = CR.CaseBB->getBasicBlock();
 
   CaseRange LHSR(CR.Range.first, Pivot);
   CaseRange RHSR(Pivot, CR.Range.second);
@@ -2446,10 +2488,9 @@
   // LHS's Case Value, and that Case Value is exactly one less than the
   // Pivot's Value, then we can branch directly to the LHS's Target,
   // rather than creating a leaf node for it.
-  if ((LHSR.second - LHSR.first) == 1 &&
-      LHSR.first->High == CR.GE &&
+  if ((LHSR.second - LHSR.first) == 1 && LHSR.first->High == CR.GE &&
       cast<ConstantInt>(C)->getValue() ==
-      (cast<ConstantInt>(CR.GE)->getValue() + 1LL)) {
+          (cast<ConstantInt>(CR.GE)->getValue() + 1LL)) {
     TrueBB = LHSR.first->BB;
   } else {
     TrueBB = CurMF->CreateMachineBasicBlock(LLVMBB);
@@ -2466,12 +2507,12 @@
   // the current Case Value, rather than emitting a RHS leaf node for it.
   if ((RHSR.second - RHSR.first) == 1 && CR.LT &&
       cast<ConstantInt>(RHSR.first->Low)->getValue() ==
-      (cast<ConstantInt>(CR.LT)->getValue() - 1LL)) {
+          (cast<ConstantInt>(CR.LT)->getValue() - 1LL)) {
     FalseBB = RHSR.first->BB;
   } else {
     FalseBB = CurMF->CreateMachineBasicBlock(LLVMBB);
     CurMF->insert(BBI, FalseBB);
-    WorkList.push_back(CaseRec(FalseBB,CR.LT,C,RHSR));
+    WorkList.push_back(CaseRec(FalseBB, CR.LT, C, RHSR));
 
     // Put SV in a virtual register to make it available from the new blocks.
     ExportFromCurrentBlock(SV);
@@ -2486,8 +2527,6 @@
     visitSwitchCase(CB, SwitchBB);
   else
     SwitchCases.push_back(CB);
-
-  return true;
 }
 
 /// handleBitTestsSwitchCase - if current case range has few destination and
@@ -2514,15 +2553,14 @@
     return false;
 
   size_t numCmps = 0;
-  for (CaseItr I = CR.Range.first, E = CR.Range.second;
-       I!=E; ++I) {
+  for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++I) {
     // Single case counts one, case range - two.
     numCmps += (I->Low == I->High ? 1 : 2);
   }
 
   // Count unique destinations
   SmallSet<MachineBasicBlock*, 4> Dests;
-  for (CaseItr I = CR.Range.first, E = CR.Range.second; I!=E; ++I) {
+  for (CaseItr I = CR.Range.first, E = CR.Range.second; I != E; ++I) {
     Dests.insert(I->BB);
     if (Dests.size() > 3)
      // Don't bother the code below if there are too many unique destinations.
@@ -2629,9 +2667,8 @@
 void SelectionDAGBuilder::Clusterify(CaseVector& Cases,
                                      const SwitchInst& SI) {
   BranchProbabilityInfo *BPI = FuncInfo.BPI;
-  // Start with "simple" cases
-  for (SwitchInst::ConstCaseIt i = SI.case_begin(), e = SI.case_end();
-       i != e; ++i) {
+  // Start with "simple" cases.
+  for (SwitchInst::ConstCaseIt i : SI.cases()) {
     const BasicBlock *SuccBB = i.getCaseSuccessor();
     MachineBasicBlock *SMBB = FuncInfo.MBBMap[SuccBB];
 
@@ -2694,32 +2731,58 @@
 
   // Figure out which block is immediately after the current one.
   MachineBasicBlock *NextBlock = nullptr;
-  MachineBasicBlock *Default = FuncInfo.MBBMap[SI.getDefaultDest()];
+  if (SwitchMBB + 1 != FuncInfo.MF->end())
+    NextBlock = SwitchMBB + 1;
 
-  // If there is only the default destination, branch to it if it is not the
-  // next basic block.  Otherwise, just fall through.
-  if (!SI.getNumCases()) {
-    // Update machine-CFG edges.
 
-    // If this is not a fall-through branch, emit the branch.
-    SwitchMBB->addSuccessor(Default);
-    if (Default != NextBlock)
-      DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(),
-                              MVT::Other, getControlRoot(),
-                              DAG.getBasicBlock(Default)));
-
-    return;
-  }
-
-  // If there are any non-default case statements, create a vector of Cases
-  // representing each one, and sort the vector so that we can efficiently
-  // create a binary search tree from them.
+  // Create a vector of Cases, sorted so that we can efficiently create a binary
+  // search tree from them.
   CaseVector Cases;
   Clusterify(Cases, SI);
 
-  // Get the Value to be switched on and default basic blocks, which will be
-  // inserted into CaseBlock records, representing basic blocks in the binary
-  // search tree.
+  // Get the default destination MBB.
+  MachineBasicBlock *Default = FuncInfo.MBBMap[SI.getDefaultDest()];
+
+  if (isa<UnreachableInst>(SI.getDefaultDest()->getFirstNonPHIOrDbg()) &&
+      !Cases.empty()) {
+    // Replace an unreachable default destination with the most popular case
+    // destination.
+    DenseMap<const BasicBlock *, unsigned> Popularity;
+    unsigned MaxPop = 0;
+    const BasicBlock *MaxBB = nullptr;
+    for (auto I : SI.cases()) {
+      const BasicBlock *BB = I.getCaseSuccessor();
+      if (++Popularity[BB] > MaxPop) {
+        MaxPop = Popularity[BB];
+        MaxBB = BB;
+      }
+    }
+
+    // Set new default.
+    assert(MaxPop > 0);
+    assert(MaxBB);
+    Default = FuncInfo.MBBMap[MaxBB];
+
+    // Remove cases that were pointing to the destination that is now the default.
+    Cases.erase(std::remove_if(Cases.begin(), Cases.end(),
+                               [&](const Case &C) { return C.BB == Default; }),
+                Cases.end());
+  }
+
+  // If there is only the default destination, go there directly.
+  if (Cases.empty()) {
+    // Update machine-CFG edges.
+    SwitchMBB->addSuccessor(Default);
+
+    // If this is not a fall-through branch, emit the branch.
+    if (Default != NextBlock) {
+      DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other,
+                              getControlRoot(), DAG.getBasicBlock(Default)));
+    }
+    return;
+  }
+
+  // Get the Value to be switched on.
   const Value *SV = SI.getCondition();
 
   // Push the initial CaseRec onto the worklist
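
A minimal sketch of the "most popular successor" selection performed above when the default destination is unreachable (STL map instead of LLVM's DenseMap; names are illustrative):

#include <string>
#include <unordered_map>
#include <vector>

// Return the successor block that the largest number of cases branch to; the
// switch lowering above promotes it to the new default and then erases the
// cases that targeted it.
std::string mostPopularSuccessor(const std::vector<std::string> &CaseSuccessors) {
  std::unordered_map<std::string, unsigned> Popularity;
  unsigned MaxPop = 0;
  std::string MaxBB;
  for (const std::string &BB : CaseSuccessors) {
    if (++Popularity[BB] > MaxPop) {
      MaxPop = Popularity[BB];
      MaxBB = BB;
    }
  }
  return MaxBB;
}
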
@@ -3613,6 +3676,74 @@
   DAG.setRoot(StoreNode);
 }
 
+void SelectionDAGBuilder::visitMaskedStore(const CallInst &I) {
+  SDLoc sdl = getCurSDLoc();
+
+  // llvm.masked.store.*(Src0, Ptr, alignment, Mask)
+  Value  *PtrOperand = I.getArgOperand(1);
+  SDValue Ptr = getValue(PtrOperand);
+  SDValue Src0 = getValue(I.getArgOperand(0));
+  SDValue Mask = getValue(I.getArgOperand(3));
+  EVT VT = Src0.getValueType();
+  unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(2)))->getZExtValue();
+  if (!Alignment)
+    Alignment = DAG.getEVTAlignment(VT);
+
+  AAMDNodes AAInfo;
+  I.getAAMetadata(AAInfo);
+
+  MachineMemOperand *MMO =
+    DAG.getMachineFunction().
+    getMachineMemOperand(MachinePointerInfo(PtrOperand),
+                          MachineMemOperand::MOStore,  VT.getStoreSize(),
+                          Alignment, AAInfo);
+  SDValue StoreNode = DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Mask, VT,
+                                         MMO, false);
+  DAG.setRoot(StoreNode);
+  setValue(&I, StoreNode);
+}
+
+void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I) {
+  SDLoc sdl = getCurSDLoc();
+
+  // @llvm.masked.load.*(Ptr, alignment, Mask, Src0)
+  Value  *PtrOperand = I.getArgOperand(0);
+  SDValue Ptr = getValue(PtrOperand);
+  SDValue Src0 = getValue(I.getArgOperand(3));
+  SDValue Mask = getValue(I.getArgOperand(2));
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT VT = TLI.getValueType(I.getType());
+  unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(1)))->getZExtValue();
+  if (!Alignment)
+    Alignment = DAG.getEVTAlignment(VT);
+
+  AAMDNodes AAInfo;
+  I.getAAMetadata(AAInfo);
+  const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);
+
+  SDValue InChain = DAG.getRoot();
+  if (AA->pointsToConstantMemory(
+      AliasAnalysis::Location(PtrOperand,
+                              AA->getTypeStoreSize(I.getType()),
+                              AAInfo))) {
+    // Do not serialize (non-volatile) loads of constant memory with anything.
+    InChain = DAG.getEntryNode();
+  }
+
+  MachineMemOperand *MMO =
+    DAG.getMachineFunction().
+    getMachineMemOperand(MachinePointerInfo(PtrOperand),
+                          MachineMemOperand::MOLoad,  VT.getStoreSize(),
+                          Alignment, AAInfo, Ranges);
+
+  SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, VT, MMO,
+                                   ISD::NON_EXTLOAD);
+  SDValue OutChain = Load.getValue(1);
+  DAG.setRoot(OutChain);
+  setValue(&I, Load);
+}
+
 void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) {
   SDLoc dl = getCurSDLoc();
   AtomicOrdering SuccessOrder = I.getSuccessOrdering();
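
For reference, a scalar model of the semantics that the two masked-memory visitors added above lower to vector nodes (a sketch based on the intrinsics' documented behaviour, not the SelectionDAG code itself):

#include <cstddef>

// Element-wise model of llvm.masked.store: only lanes whose mask bit is set
// are written back to memory.
void maskedStoreModel(int *Ptr, const int *Src0, const bool *Mask, size_t N) {
  for (size_t I = 0; I != N; ++I)
    if (Mask[I])
      Ptr[I] = Src0[I];
}

// Element-wise model of llvm.masked.load: masked-off lanes take their value
// from the pass-through operand (Src0) instead of from memory.
void maskedLoadModel(int *Dst, const int *Ptr, const bool *Mask,
                     const int *Src0, size_t N) {
  for (size_t I = 0; I != N; ++I)
    Dst[I] = Mask[I] ? Ptr[I] : Src0[I];
}
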
@@ -4460,11 +4591,10 @@
       return DAG.getConstantFP(1.0, LHS.getValueType());
 
     const Function *F = DAG.getMachineFunction().getFunction();
-    if (!F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                         Attribute::OptimizeForSize) ||
+    if (!F->hasFnAttribute(Attribute::OptimizeForSize) ||
         // If optimizing for size, don't insert too many multiplies.  This
         // inserts up to 5 multiplies.
-        CountPopulation_32(Val)+Log2_32(Val) < 7) {
+        countPopulation(Val) + Log2_32(Val) < 7) {
       // We use the simple binary decomposition method to generate the multiply
       // sequence.  There are more optimal ways to do this (for example,
       // powi(x,15) generates one more multiply than it should), but this has
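
The popcount + log2 budget checked above corresponds to the usual binary-decomposition expansion of powi; a standalone sketch (hypothetical helper, not the DAG-building code):

// Expand pow(X, Val) for an unsigned integer exponent by binary decomposition:
// one multiply into the result per set bit of Val, plus one squaring per bit
// position scanned, i.e. roughly popcount(Val) + log2(Val) multiplies total.
double powiExpand(double X, unsigned Val) {
  double Result = 1.0;
  while (Val) {
    if (Val & 1)
      Result *= X;   // fold the current power of X into the result
    Val >>= 1;
    if (Val)
      X *= X;        // square X for the next bit position
  }
  return Result;
}
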
@@ -4623,7 +4753,8 @@
     return nullptr;
   case Intrinsic::read_register: {
     Value *Reg = I.getArgOperand(0);
-    SDValue RegName = DAG.getMDNode(cast<MDNode>(Reg));
+    SDValue RegName =
+        DAG.getMDNode(cast<MDNode>(cast<MetadataAsValue>(Reg)->getMetadata()));
     EVT VT = TLI.getValueType(I.getType());
     setValue(&I, DAG.getNode(ISD::READ_REGISTER, sdl, VT, RegName));
     return nullptr;
@@ -4632,7 +4763,8 @@
     Value *Reg = I.getArgOperand(0);
     Value *RegValue = I.getArgOperand(1);
     SDValue Chain = getValue(RegValue).getOperand(0);
-    SDValue RegName = DAG.getMDNode(cast<MDNode>(Reg));
+    SDValue RegName =
+        DAG.getMDNode(cast<MDNode>(cast<MetadataAsValue>(Reg)->getMetadata()));
     DAG.setRoot(DAG.getNode(ISD::WRITE_REGISTER, sdl, MVT::Other, Chain,
                             RegName, getValue(RegValue)));
     return nullptr;
@@ -4642,6 +4774,7 @@
   case Intrinsic::longjmp:
     return &"_longjmp"[!TLI.usesUnderscoreLongJmp()];
   case Intrinsic::memcpy: {
+    // FIXME: this definition of "user defined address space" is x86-specific
     // Assert for address < 256 since we support only user defined address
     // spaces.
     assert(cast<PointerType>(I.getArgOperand(0)->getType())->getAddressSpace()
@@ -4662,6 +4795,7 @@
     return nullptr;
   }
   case Intrinsic::memset: {
+    // FIXME: this definition of "user defined address space" is x86-specific
     // Assert for address < 256 since we support only user defined address
     // spaces.
     assert(cast<PointerType>(I.getArgOperand(0)->getType())->getAddressSpace()
@@ -4679,6 +4813,7 @@
     return nullptr;
   }
   case Intrinsic::memmove: {
+    // FIXME: this definition of "user defined address space" is x86-specific
     // Assert for address < 256 since we support only user defined address
     // spaces.
     assert(cast<PointerType>(I.getArgOperand(0)->getType())->getAddressSpace()
@@ -4914,6 +5049,12 @@
     return nullptr;
   }
 
+  case Intrinsic::masked_load:
+    visitMaskedLoad(I);
+    return nullptr;
+  case Intrinsic::masked_store:
+    visitMaskedStore(I);
+    return nullptr;
   case Intrinsic::x86_mmx_pslli_w:
   case Intrinsic::x86_mmx_pslli_d:
   case Intrinsic::x86_mmx_pslli_q:
@@ -5459,6 +5600,78 @@
     visitPatchpoint(&I);
     return nullptr;
   }
+  case Intrinsic::experimental_gc_statepoint: {
+    visitStatepoint(I);
+    return nullptr;
+  }
+  case Intrinsic::experimental_gc_result_int:
+  case Intrinsic::experimental_gc_result_float:
+  case Intrinsic::experimental_gc_result_ptr:
+  case Intrinsic::experimental_gc_result: {
+    visitGCResult(I);
+    return nullptr;
+  }
+  case Intrinsic::experimental_gc_relocate: {
+    visitGCRelocate(I);
+    return nullptr;
+  }
+  case Intrinsic::instrprof_increment:
+    llvm_unreachable("instrprof failed to lower an increment");
+
+  case Intrinsic::frameallocate: {
+    MachineFunction &MF = DAG.getMachineFunction();
+    const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo();
+
+    // Do the allocation and map it as a normal value.
+    // FIXME: Maybe we should add this to the alloca map so that we don't have
+    // to register allocate it?
+    uint64_t Size = cast<ConstantInt>(I.getArgOperand(0))->getZExtValue();
+    int Alloc = MF.getFrameInfo()->CreateFrameAllocation(Size);
+    MVT PtrVT = TLI.getPointerTy(0);
+    SDValue FIVal = DAG.getFrameIndex(Alloc, PtrVT);
+    setValue(&I, FIVal);
+
+    // Directly emit a FRAME_ALLOC machine instr. Label assignment emission is
+    // the same on all targets.
+    MCSymbol *FrameAllocSym =
+        MF.getMMI().getContext().getOrCreateFrameAllocSymbol(MF.getName());
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, dl,
+            TII->get(TargetOpcode::FRAME_ALLOC))
+        .addSym(FrameAllocSym)
+        .addFrameIndex(Alloc);
+
+    return nullptr;
+  }
+
+  case Intrinsic::framerecover: {
+    // i8* @llvm.framerecover(i8* %fn, i8* %fp)
+    MachineFunction &MF = DAG.getMachineFunction();
+    MVT PtrVT = TLI.getPointerTy(0);
+
+    // Get the symbol that defines the frame offset.
+    Function *Fn = cast<Function>(I.getArgOperand(0)->stripPointerCasts());
+    MCSymbol *FrameAllocSym =
+        MF.getMMI().getContext().getOrCreateFrameAllocSymbol(Fn->getName());
+
+    // Create a TargetExternalSymbol for the label to avoid any target lowering
+    // that would make this PC relative.
+    StringRef Name = FrameAllocSym->getName();
+    assert(Name.size() == strlen(Name.data()) && "not null terminated");
+    SDValue OffsetSym = DAG.getTargetExternalSymbol(Name.data(), PtrVT);
+    SDValue OffsetVal =
+        DAG.getNode(ISD::FRAME_ALLOC_RECOVER, sdl, PtrVT, OffsetSym);
+
+    // Add the offset to the FP.
+    Value *FP = I.getArgOperand(1);
+    SDValue FPVal = getValue(FP);
+    SDValue Add = DAG.getNode(ISD::ADD, sdl, PtrVT, FPVal, OffsetVal);
+    setValue(&I, Add);
+
+    return nullptr;
+  }
+  case Intrinsic::eh_begincatch:
+  case Intrinsic::eh_endcatch:
+    llvm_unreachable("begin/end catch intrinsics not lowered in codegen");
   }
 }
 
@@ -5491,9 +5704,8 @@
 
     CLI.setChain(getRoot());
   }
-
-  const TargetLowering *TLI = TM.getSubtargetImpl()->getTargetLowering();
-  std::pair<SDValue, SDValue> Result = TLI->LowerCallTo(CLI);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
 
   assert((CLI.IsTailCall || Result.second.getNode()) &&
          "Non-null chain expected with non-tail call!");
@@ -6191,9 +6403,10 @@
 
   // If this is a constraint for a single physreg, or a constraint for a
   // register class, find it.
-  std::pair<unsigned, const TargetRegisterClass*> PhysReg =
-    TLI.getRegForInlineAsmConstraint(OpInfo.ConstraintCode,
-                                     OpInfo.ConstraintVT);
+  std::pair<unsigned, const TargetRegisterClass *> PhysReg =
+      TLI.getRegForInlineAsmConstraint(MF.getSubtarget().getRegisterInfo(),
+                                       OpInfo.ConstraintCode,
+                                       OpInfo.ConstraintVT);
 
   unsigned NumRegs = 1;
   if (OpInfo.ConstraintVT != MVT::Other) {
@@ -6289,8 +6502,8 @@
   SDISelAsmOperandInfoVector ConstraintOperands;
 
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  TargetLowering::AsmOperandInfoVector
-    TargetConstraints = TLI.ParseConstraints(CS);
+  TargetLowering::AsmOperandInfoVector TargetConstraints =
+      TLI.ParseConstraints(DAG.getSubtarget().getRegisterInfo(), CS);
 
   bool hasMemory = false;
 
@@ -6382,12 +6595,13 @@
       SDISelAsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput];
 
       if (OpInfo.ConstraintVT != Input.ConstraintVT) {
-        std::pair<unsigned, const TargetRegisterClass*> MatchRC =
-          TLI.getRegForInlineAsmConstraint(OpInfo.ConstraintCode,
-                                            OpInfo.ConstraintVT);
-        std::pair<unsigned, const TargetRegisterClass*> InputRC =
-          TLI.getRegForInlineAsmConstraint(Input.ConstraintCode,
-                                            Input.ConstraintVT);
+        const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
+        std::pair<unsigned, const TargetRegisterClass *> MatchRC =
+            TLI.getRegForInlineAsmConstraint(TRI, OpInfo.ConstraintCode,
+                                             OpInfo.ConstraintVT);
+        std::pair<unsigned, const TargetRegisterClass *> InputRC =
+            TLI.getRegForInlineAsmConstraint(TRI, Input.ConstraintCode,
+                                             Input.ConstraintVT);
         if ((OpInfo.ConstraintVT.isInteger() !=
              Input.ConstraintVT.isInteger()) ||
             (MatchRC.second != InputRC.second)) {
@@ -6848,7 +7062,8 @@
 SelectionDAGBuilder::lowerCallOperands(ImmutableCallSite CS, unsigned ArgIdx,
                                        unsigned NumArgs, SDValue Callee,
                                        bool UseVoidTy,
-                                       MachineBasicBlock *LandingPad) {
+                                       MachineBasicBlock *LandingPad,
+                                       bool IsPatchPoint) {
   TargetLowering::ArgListTy Args;
   Args.reserve(NumArgs);
 
@@ -6871,7 +7086,7 @@
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(getCurSDLoc()).setChain(getRoot())
     .setCallee(CS.getCallingConv(), retTy, Callee, std::move(Args), NumArgs)
-    .setDiscardResult(CS->use_empty());
+    .setDiscardResult(CS->use_empty()).setIsPatchPoint(IsPatchPoint);
 
   return lowerInvokable(CLI, LandingPad);
 }
@@ -7003,7 +7218,7 @@
   unsigned NumCallArgs = IsAnyRegCC ? 0 : NumArgs;
   std::pair<SDValue, SDValue> Result =
     lowerCallOperands(CS, NumMetaOpers, NumCallArgs, Callee, IsAnyRegCC,
-                      LandingPad);
+                      LandingPad, true);
 
   SDNode *CallEnd = Result.second.getNode();
   if (HasDef && (CallEnd->getOpcode() == ISD::CopyFromReg))
@@ -7051,8 +7266,7 @@
 
   // Push the arguments from the call instruction up to the register mask.
   SDNode::op_iterator e = HasGlue ? Call->op_end()-2 : Call->op_end()-1;
-  for (SDNode::op_iterator i = Call->op_begin()+2; i != e; ++i)
-    Ops.push_back(*i);
+  Ops.append(Call->op_begin() + 2, e);
 
   // Push live variables for the stack map.
   addStackMapLiveVars(CS, NumMetaOpers + NumArgs, Ops, *this);
@@ -7251,11 +7465,8 @@
       }
       if (Args[i].isNest)
         Flags.setNest();
-      if (NeedsRegBlock) {
+      if (NeedsRegBlock)
         Flags.setInConsecutiveRegs();
-        if (Value == NumValues - 1)
-          Flags.setInConsecutiveRegsLast();
-      }
       Flags.setOrigAlign(OriginalAlignment);
 
       MVT PartVT = getRegisterType(CLI.RetTy->getContext(), VT);
@@ -7304,6 +7515,9 @@
         CLI.Outs.push_back(MyFlags);
         CLI.OutVals.push_back(Parts[j]);
       }
+
+      if (NeedsRegBlock && Value == NumValues - 1)
+        CLI.Outs[CLI.Outs.size() - 1].Flags.setInConsecutiveRegsLast();
     }
   }
 
@@ -7460,7 +7674,8 @@
     ISD::ArgFlagsTy Flags;
     Flags.setSRet();
     MVT RegisterVT = TLI->getRegisterType(*DAG.getContext(), ValueVTs[0]);
-    ISD::InputArg RetArg(Flags, RegisterVT, ValueVTs[0], true, 0, 0);
+    ISD::InputArg RetArg(Flags, RegisterVT, ValueVTs[0], true,
+                         ISD::InputArg::NoArgIndex, 0);
     Ins.push_back(RetArg);
   }
 
@@ -7518,11 +7733,8 @@
       }
       if (F.getAttributes().hasAttribute(Idx, Attribute::Nest))
         Flags.setNest();
-      if (NeedsRegBlock) {
+      if (NeedsRegBlock)
         Flags.setInConsecutiveRegs();
-        if (Value == NumValues - 1)
-          Flags.setInConsecutiveRegsLast();
-      }
       Flags.setOrigAlign(OriginalAlignment);
 
       MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
@@ -7537,6 +7749,8 @@
           MyFlags.Flags.setOrigAlign(1);
         Ins.push_back(MyFlags);
       }
+      if (NeedsRegBlock && Value == NumValues - 1)
+        Ins[Ins.size() - 1].Flags.setInConsecutiveRegsLast();
       PartBase += VT.getStoreSize();
     }
   }
@@ -7671,7 +7885,6 @@
   assert(i == InVals.size() && "Argument register count mismatch!");
 
   // Finally, if the target has anything special to do, allow it to do so.
-  // FIXME: this should insert code into the DAG!
   EmitFunctionEntryCode();
 }
 
@@ -7762,6 +7975,7 @@
 SelectionDAGBuilder::StackProtectorDescriptor::
 AddSuccessorMBB(const BasicBlock *BB,
                 MachineBasicBlock *ParentMBB,
+                bool IsLikely,
                 MachineBasicBlock *SuccMBB) {
   // If SuccBB has not been created yet, create it.
   if (!SuccMBB) {
@@ -7771,6 +7985,7 @@
     MF->insert(++BBI, SuccMBB);
   }
   // Add it as a successor of ParentMBB.
-  ParentMBB->addSuccessor(SuccMBB);
+  ParentMBB->addSuccessor(
+      SuccMBB, BranchProbabilityInfo::getBranchWeightStackProtector(IsLikely));
   return SuccMBB;
 }
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index f74e652..ad7411f 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -14,11 +14,13 @@
 #ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_SELECTIONDAGBUILDER_H
 #define LLVM_LIB_CODEGEN_SELECTIONDAG_SELECTIONDAGBUILDER_H
 
+#include "StatepointLowering.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/IR/CallSite.h"
+#include "llvm/IR/Statepoint.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetLowering.h"
@@ -115,6 +117,10 @@
   /// get simple disambiguation between loads without worrying about alias
   /// analysis.
   SmallVector<SDValue, 8> PendingLoads;
+
+  /// State used while lowering a statepoint sequence (gc_statepoint,
+  /// gc_relocate, and gc_result).  See StatepointLowering.h/cpp for details.
+  StatepointLoweringState StatepointLowering;
 private:
 
   /// PendingExports - CopyToReg nodes that copy values to virtual registers
@@ -417,8 +423,8 @@
       assert(!shouldEmitStackProtector() && "Stack Protector Descriptor is "
              "already initialized!");
       ParentMBB = MBB;
-      SuccessMBB = AddSuccessorMBB(BB, MBB);
-      FailureMBB = AddSuccessorMBB(BB, MBB, FailureMBB);
+      SuccessMBB = AddSuccessorMBB(BB, MBB, /* IsLikely */ true);
+      FailureMBB = AddSuccessorMBB(BB, MBB, /* IsLikely */ false, FailureMBB);
       if (!Guard)
         Guard = StackProtCheckCall.getArgOperand(0);
     }
@@ -487,9 +493,10 @@
 
     /// Add a successor machine basic block to ParentMBB. If the successor mbb
     /// has not been created yet (i.e. if SuccMBB = 0), then the machine basic
-    /// block will be created.
+    /// block will be created. Assign a large weight if IsLikely is true.
     MachineBasicBlock *AddSuccessorMBB(const BasicBlock *BB,
                                        MachineBasicBlock *ParentMBB,
+                                       bool IsLikely,
                                        MachineBasicBlock *SuccMBB = nullptr);
   };
 
@@ -612,6 +619,13 @@
     N = NewN;
   }
 
+  void removeValue(const Value *V) {
+    // This is to support a hack in lowerCallFromStatepoint.
+    // It should be removed when the hack is resolved.
+    if (NodeMap.count(V))
+      NodeMap.erase(V);
+  }
+
   void setUnusedArgValue(const Value *V, SDValue NewN) {
     SDValue &N = UnusedArgNodeMap[V];
     assert(!N.getNode() && "Already set a value for this node!");
@@ -640,12 +654,15 @@
           unsigned NumArgs,
           SDValue Callee,
           bool UseVoidTy = false,
-          MachineBasicBlock *LandingPad = nullptr);
+          MachineBasicBlock *LandingPad = nullptr,
+          bool IsPatchPoint = false);
 
   /// UpdateSplitBlock - When an MBB was split during scheduling, update the
   /// references that need to refer to the last resulting block.
   void UpdateSplitBlock(MachineBasicBlock *First, MachineBasicBlock *Last);
 
+  // This function is responsible for the whole statepoint lowering process.
+  void LowerStatepoint(ImmutableStatepoint Statepoint);
 private:
   std::pair<SDValue, SDValue> lowerInvokable(
           TargetLowering::CallLoweringInfo &CLI,
@@ -673,6 +690,8 @@
                                CaseRecVector& WorkList,
                                const Value* SV,
                                MachineBasicBlock *SwitchBB);
+  void splitSwitchCase(CaseRec &CR, CaseItr Pivot, CaseRecVector &WorkList,
+                       const Value *SV, MachineBasicBlock *SwitchBB);
   bool handleBitTestsSwitchCase(CaseRec& CR,
                                 CaseRecVector& WorkList,
                                 const Value* SV,
@@ -699,6 +718,8 @@
   void visitJumpTable(JumpTable &JT);
   void visitJumpTableHeader(JumpTable &JT, JumpTableHeader &JTH,
                             MachineBasicBlock *SwitchBB);
+  unsigned visitLandingPadClauseBB(GlobalValue *ClauseGV,
+                                   MachineBasicBlock *LPadMBB);
 
 private:
   // These all get lowered before this pass.
@@ -756,6 +777,8 @@
   void visitAlloca(const AllocaInst &I);
   void visitLoad(const LoadInst &I);
   void visitStore(const StoreInst &I);
+  void visitMaskedLoad(const CallInst &I);
+  void visitMaskedStore(const CallInst &I);
   void visitAtomicCmpXchg(const AtomicCmpXchgInst &I);
   void visitAtomicRMW(const AtomicRMWInst &I);
   void visitFence(const FenceInst &I);
@@ -784,6 +807,11 @@
   void visitPatchpoint(ImmutableCallSite CS,
                        MachineBasicBlock *LandingPad = nullptr);
 
+  // These three are implemented in StatepointLowering.cpp
+  void visitStatepoint(const CallInst &I);
+  void visitGCRelocate(const CallInst &I);
+  void visitGCResult(const CallInst &I);
+
   void visitUserOp1(const Instruction &I) {
     llvm_unreachable("UserOp1 should not exist at instruction selection time!");
   }
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index c9f6cff..17eff94 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -187,6 +187,7 @@
   case ISD::FMUL:                       return "fmul";
   case ISD::FDIV:                       return "fdiv";
   case ISD::FMA:                        return "fma";
+  case ISD::FMAD:                       return "fmad";
   case ISD::FREM:                       return "frem";
   case ISD::FCOPYSIGN:                  return "fcopysign";
   case ISD::FGETSIGN:                   return "fgetsign";
@@ -269,6 +270,8 @@
     // Other operators
   case ISD::LOAD:                       return "load";
   case ISD::STORE:                      return "store";
+  case ISD::MLOAD:                      return "masked_load";
+  case ISD::MSTORE:                     return "masked_store";
   case ISD::VAARG:                      return "vaarg";
   case ISD::VACOPY:                     return "vacopy";
   case ISD::VAEND:                      return "vaend";
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 79109b7..5e867cf 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -11,7 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/GCStrategy.h"
 #include "ScheduleDAGSDNodes.h"
 #include "SelectionDAGBuilder.h"
 #include "llvm/ADT/PostOrderIterator.h"
@@ -19,10 +19,11 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/FastISel.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/GCMetadata.h"
-#include "llvm/CodeGen/GCStrategy.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -31,6 +32,7 @@
 #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
 #include "llvm/CodeGen/SchedulerRegistry.h"
 #include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/Function.h"
@@ -40,6 +42,7 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -47,7 +50,6 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetIntrinsicInfo.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
@@ -181,6 +183,10 @@
         cl::init(true), cl::Hidden);
 
 #ifndef NDEBUG
+static cl::opt<std::string>
+FilterDAGBasicBlockName("filter-view-dags", cl::Hidden,
+                        cl::desc("Only display the basic block whose name "
+                                 "matches this for all view-*-dags options"));
 static cl::opt<bool>
 ViewDAGCombine1("view-dag-combine1-dags", cl::Hidden,
           cl::desc("Pop up a window to show dags before the first "
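
As a usage sketch (the new option is defined in this hunk; the exact invocation is illustrative): passing, e.g., -filter-view-dags=entry together with any of the -view-*-dags options to llc restricts the popped-up DAG viewers to the basic block named "entry" instead of opening one window per block.
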
@@ -345,7 +351,8 @@
     initializeGCModuleInfoPass(*PassRegistry::getPassRegistry());
     initializeAliasAnalysisAnalysisGroup(*PassRegistry::getPassRegistry());
     initializeBranchProbabilityInfoPass(*PassRegistry::getPassRegistry());
-    initializeTargetLibraryInfoPass(*PassRegistry::getPassRegistry());
+    initializeTargetLibraryInfoWrapperPassPass(
+        *PassRegistry::getPassRegistry());
   }
 
 SelectionDAGISel::~SelectionDAGISel() {
@@ -359,7 +366,7 @@
   AU.addPreserved<AliasAnalysis>();
   AU.addRequired<GCModuleInfo>();
   AU.addPreserved<GCModuleInfo>();
-  AU.addRequired<TargetLibraryInfo>();
+  AU.addRequired<TargetLibraryInfoWrapperPass>();
   if (UseMBPI && OptLevel != CodeGenOpt::None)
     AU.addRequired<BranchProbabilityInfo>();
   MachineFunctionPass::getAnalysisUsage(AU);
@@ -372,7 +379,7 @@
 ///
 /// This is required for correctness, so it must be done at -O0.
 ///
-static void SplitCriticalSideEffectEdges(Function &Fn, Pass *SDISel) {
+static void SplitCriticalSideEffectEdges(Function &Fn, AliasAnalysis *AA) {
   // Loop for blocks with phi nodes.
   for (Function::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB) {
     PHINode *PN = dyn_cast<PHINode>(BB->begin());
@@ -396,8 +403,9 @@
           continue;
 
         // Okay, we have to split this edge.
-        SplitCriticalEdge(Pred->getTerminator(),
-                          GetSuccessorNumber(Pred, BB), SDISel, true);
+        SplitCriticalEdge(
+            Pred->getTerminator(), GetSuccessorNumber(Pred, BB),
+            CriticalEdgeSplittingOptions(AA).setMergeIdenticalEdges());
         goto ReprocessBlock;
       }
   }
@@ -429,12 +437,12 @@
   TLI = MF->getSubtarget().getTargetLowering();
   RegInfo = &MF->getRegInfo();
   AA = &getAnalysis<AliasAnalysis>();
-  LibInfo = &getAnalysis<TargetLibraryInfo>();
+  LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : nullptr;
 
   DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n");
 
-  SplitCriticalSideEffectEdges(const_cast<Function&>(Fn), this);
+  SplitCriticalSideEffectEdges(const_cast<Function&>(Fn), AA);
 
   CurDAG->init(*MF);
   FuncInfo->set(Fn, *MF, CurDAG);
@@ -650,6 +658,12 @@
   std::string BlockName;
   int BlockNumber = -1;
   (void)BlockNumber;
+  bool MatchFilterBB = false; (void)MatchFilterBB;
+#ifndef NDEBUG
+  MatchFilterBB = (FilterDAGBasicBlockName.empty() ||
+                   FilterDAGBasicBlockName ==
+                       FuncInfo->MBB->getBasicBlock()->getName().str());
+#endif
 #ifdef NDEBUG
   if (ViewDAGCombine1 || ViewLegalizeTypesDAGs || ViewLegalizeDAGs ||
       ViewDAGCombine2 || ViewDAGCombineLT || ViewISelDAGs || ViewSchedDAGs ||
@@ -663,7 +677,8 @@
   DEBUG(dbgs() << "Initial selection DAG: BB#" << BlockNumber
         << " '" << BlockName << "'\n"; CurDAG->dump());
 
-  if (ViewDAGCombine1) CurDAG->viewGraph("dag-combine1 input for " + BlockName);
+  if (ViewDAGCombine1 && MatchFilterBB)
+    CurDAG->viewGraph("dag-combine1 input for " + BlockName);
 
   // Run the DAG combiner in pre-legalize mode.
   {
@@ -676,8 +691,8 @@
 
   // Second step, hack on the DAG until it only uses operations and types that
   // the target supports.
-  if (ViewLegalizeTypesDAGs) CurDAG->viewGraph("legalize-types input for " +
-                                               BlockName);
+  if (ViewLegalizeTypesDAGs && MatchFilterBB)
+    CurDAG->viewGraph("legalize-types input for " + BlockName);
 
   bool Changed;
   {
@@ -691,7 +706,7 @@
   CurDAG->NewNodesMustHaveLegalTypes = true;
 
   if (Changed) {
-    if (ViewDAGCombineLT)
+    if (ViewDAGCombineLT && MatchFilterBB)
       CurDAG->viewGraph("dag-combine-lt input for " + BlockName);
 
     // Run the DAG combiner in post-type-legalize mode.
@@ -717,7 +732,7 @@
       CurDAG->LegalizeTypes();
     }
 
-    if (ViewDAGCombineLT)
+    if (ViewDAGCombineLT && MatchFilterBB)
       CurDAG->viewGraph("dag-combine-lv input for " + BlockName);
 
     // Run the DAG combiner in post-type-legalize mode.
@@ -731,7 +746,8 @@
           << BlockNumber << " '" << BlockName << "'\n"; CurDAG->dump());
   }
 
-  if (ViewLegalizeDAGs) CurDAG->viewGraph("legalize input for " + BlockName);
+  if (ViewLegalizeDAGs && MatchFilterBB)
+    CurDAG->viewGraph("legalize input for " + BlockName);
 
   {
     NamedRegionTimer T("DAG Legalization", GroupName, TimePassesIsEnabled);
@@ -741,7 +757,8 @@
   DEBUG(dbgs() << "Legalized selection DAG: BB#" << BlockNumber
         << " '" << BlockName << "'\n"; CurDAG->dump());
 
-  if (ViewDAGCombine2) CurDAG->viewGraph("dag-combine2 input for " + BlockName);
+  if (ViewDAGCombine2 && MatchFilterBB)
+    CurDAG->viewGraph("dag-combine2 input for " + BlockName);
 
   // Run the DAG combiner in post-legalize mode.
   {
@@ -755,7 +772,8 @@
   if (OptLevel != CodeGenOpt::None)
     ComputeLiveOutVRegInfo();
 
-  if (ViewISelDAGs) CurDAG->viewGraph("isel input for " + BlockName);
+  if (ViewISelDAGs && MatchFilterBB)
+    CurDAG->viewGraph("isel input for " + BlockName);
 
   // Third, instruction select all of the operations to machine code, adding the
   // code to the MachineBasicBlock.
@@ -767,7 +785,8 @@
   DEBUG(dbgs() << "Selected selection DAG: BB#" << BlockNumber
         << " '" << BlockName << "'\n"; CurDAG->dump());
 
-  if (ViewSchedDAGs) CurDAG->viewGraph("scheduler input for " + BlockName);
+  if (ViewSchedDAGs && MatchFilterBB)
+    CurDAG->viewGraph("scheduler input for " + BlockName);
 
   // Schedule machine code.
   ScheduleDAGSDNodes *Scheduler = CreateScheduler();
@@ -777,7 +796,7 @@
     Scheduler->Run(CurDAG, FuncInfo->MBB);
   }
 
-  if (ViewSUnitDAGs) Scheduler->viewGraph();
+  if (ViewSUnitDAGs && MatchFilterBB) Scheduler->viewGraph();
 
   // Emit machine code to BB.  This can change 'BB' to the last block being
   // inserted into.
@@ -892,6 +911,8 @@
 void SelectionDAGISel::PrepareEHLandingPad() {
   MachineBasicBlock *MBB = FuncInfo->MBB;
 
+  const TargetRegisterClass *PtrRC = TLI->getRegClassFor(TLI->getPointerTy());
+
   // Add a label to mark the beginning of the landing pad.  Deletion of the
   // landing pad can thus be detected via the MachineModuleInfo.
   MCSymbol *Label = MF->getMMI().addLandingPad(MBB);
@@ -903,8 +924,73 @@
   BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), II)
     .addSym(Label);
 
+  // If this is an MSVC-style personality function, we need to split the landing
+  // pad into several BBs.
+  const BasicBlock *LLVMBB = MBB->getBasicBlock();
+  const LandingPadInst *LPadInst = LLVMBB->getLandingPadInst();
+  MF->getMMI().addPersonality(
+      MBB, cast<Function>(LPadInst->getPersonalityFn()->stripPointerCasts()));
+  if (MF->getMMI().getPersonalityType() == EHPersonality::MSVC_Win64SEH) {
+    // Make virtual registers and a series of labels that fill in values for the
+    // clauses.
+    auto &RI = MF->getRegInfo();
+    FuncInfo->ExceptionSelectorVirtReg = RI.createVirtualRegister(PtrRC);
+
+    // Get all invoke BBs that will unwind into the clause BBs.
+    SmallVector<MachineBasicBlock *, 4> InvokeBBs(MBB->pred_begin(),
+                                                  MBB->pred_end());
+
+    // Emit separate machine basic blocks with separate labels for each clause
+    // before the main landing pad block.
+    MachineInstrBuilder SelectorPHI = BuildMI(
+        *MBB, MBB->begin(), SDB->getCurDebugLoc(), TII->get(TargetOpcode::PHI),
+        FuncInfo->ExceptionSelectorVirtReg);
+    for (unsigned I = 0, E = LPadInst->getNumClauses(); I != E; ++I) {
+      // Skip filter clauses, we can't implement them yet.
+      if (LPadInst->isFilter(I))
+        continue;
+
+      MachineBasicBlock *ClauseBB = MF->CreateMachineBasicBlock(LLVMBB);
+      MF->insert(MBB, ClauseBB);
+
+      // Add the edge from the invoke to the clause.
+      for (MachineBasicBlock *InvokeBB : InvokeBBs)
+        InvokeBB->addSuccessor(ClauseBB);
+
+      // Mark the clause as a landing pad or MI passes will delete it.
+      ClauseBB->setIsLandingPad();
+
+      GlobalValue *ClauseGV = ExtractTypeInfo(LPadInst->getClause(I));
+
+      // Start the BB with a label.
+      MCSymbol *ClauseLabel = MF->getMMI().addClauseForLandingPad(MBB);
+      BuildMI(*ClauseBB, ClauseBB->begin(), SDB->getCurDebugLoc(), II)
+          .addSym(ClauseLabel);
+
+      // Construct a simple BB that defines a register with the typeid constant.
+      FuncInfo->MBB = ClauseBB;
+      FuncInfo->InsertPt = ClauseBB->end();
+      unsigned VReg = SDB->visitLandingPadClauseBB(ClauseGV, MBB);
+      CurDAG->setRoot(SDB->getRoot());
+      SDB->clear();
+      CodeGenAndEmitDAG();
+
+      // Add the typeid virtual register to the phi in the main landing pad.
+      SelectorPHI.addReg(VReg).addMBB(ClauseBB);
+    }
+
+    // Remove the edge from the invoke to the lpad.
+    for (MachineBasicBlock *InvokeBB : InvokeBBs)
+      InvokeBB->removeSuccessor(MBB);
+
+    // Restore FuncInfo back to its previous state and select the main landing
+    // pad block.
+    FuncInfo->MBB = MBB;
+    FuncInfo->InsertPt = MBB->end();
+    return;
+  }
+
   // Mark exception register as live in.
-  const TargetRegisterClass *PtrRC = TLI->getRegClassFor(TLI->getPointerTy());
   if (unsigned Reg = TLI->getExceptionPointerRegister())
     FuncInfo->ExceptionPointerVirtReg = MBB->addLiveIn(Reg, PtrRC);
 
diff --git a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
new file mode 100644
index 0000000..1271f6b
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -0,0 +1,679 @@
+//===-- StatepointLowering.cpp - SDAGBuilder's statepoint code -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes support code used by SelectionDAGBuilder when lowering a
+// statepoint sequence in SelectionDAG IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "StatepointLowering.h"
+#include "SelectionDAGBuilder.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/Target/TargetLowering.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "statepoint-lowering"
+
+STATISTIC(NumSlotsAllocatedForStatepoints,
+          "Number of stack slots allocated for statepoints");
+STATISTIC(NumOfStatepoints, "Number of statepoint nodes encountered");
+STATISTIC(StatepointMaxSlotsRequired,
+          "Maximum number of stack slots required for a single statepoint");
+
+void
+StatepointLoweringState::startNewStatepoint(SelectionDAGBuilder &Builder) {
+  // Consistency check
+  assert(PendingGCRelocateCalls.empty() &&
+         "Trying to visit statepoint before finished processing previous one");
+  Locations.clear();
+  RelocLocations.clear();
+  NextSlotToAllocate = 0;
+  // Need to resize this on each safepoint - we need the two to stay in
+  // sync and the clear patterns of a SelectionDAGBuilder have no relation
+  // to FunctionLoweringInfo.
+  AllocatedStackSlots.resize(Builder.FuncInfo.StatepointStackSlots.size());
+  for (size_t i = 0; i < AllocatedStackSlots.size(); i++) {
+    AllocatedStackSlots[i] = false;
+  }
+}
+void StatepointLoweringState::clear() {
+  Locations.clear();
+  RelocLocations.clear();
+  AllocatedStackSlots.clear();
+  assert(PendingGCRelocateCalls.empty() &&
+         "cleared before statepoint sequence completed");
+}
+
+SDValue
+StatepointLoweringState::allocateStackSlot(EVT ValueType,
+                                           SelectionDAGBuilder &Builder) {
+
+  NumSlotsAllocatedForStatepoints++;
+
+  // The basic scheme here is to first look for a previously created stack slot
+  // which is not in use (accounting for the fact arbitrary slots may already
+  // be reserved), or to create a new stack slot and use it.
+
+  // If this doesn't succeed in 40000 iterations, something is seriously wrong
+  for (int i = 0; i < 40000; i++) {
+    assert(Builder.FuncInfo.StatepointStackSlots.size() ==
+               AllocatedStackSlots.size() &&
+           "broken invariant");
+    const size_t NumSlots = AllocatedStackSlots.size();
+    assert(NextSlotToAllocate <= NumSlots && "broken invariant");
+
+    if (NextSlotToAllocate >= NumSlots) {
+      assert(NextSlotToAllocate == NumSlots);
+      // record stats
+      if (NumSlots + 1 > StatepointMaxSlotsRequired) {
+        StatepointMaxSlotsRequired = NumSlots + 1;
+      }
+
+      SDValue SpillSlot = Builder.DAG.CreateStackTemporary(ValueType);
+      const unsigned FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
+      Builder.FuncInfo.StatepointStackSlots.push_back(FI);
+      AllocatedStackSlots.push_back(true);
+      return SpillSlot;
+    }
+    if (!AllocatedStackSlots[NextSlotToAllocate]) {
+      const int FI = Builder.FuncInfo.StatepointStackSlots[NextSlotToAllocate];
+      AllocatedStackSlots[NextSlotToAllocate] = true;
+      return Builder.DAG.getFrameIndex(FI, ValueType);
+    }
+    // Note: We deliberately choose to advance this only on the failing path.
+    // Doing so on the succeeding path involves a bit of complexity that caused
+    // a minor bug previously.  Unless performance shows this matters, please
+    // keep this code as simple as possible.
+    NextSlotToAllocate++;
+  }
+  llvm_unreachable("infinite loop?");
+}
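
The allocation scheme above is essentially a first-fit scan over a bit-vector of claimed slots, growing the pool when everything is taken; a standalone sketch with STL types (illustrative only, not the FunctionLoweringInfo-backed version):

#include <cstddef>
#include <vector>

// First-fit reuse: return the index of the first unclaimed slot, marking it
// claimed; if every existing slot is already claimed, append a new one.
size_t allocateSlotFirstFit(std::vector<bool> &Allocated) {
  for (size_t I = 0, E = Allocated.size(); I != E; ++I) {
    if (!Allocated[I]) {
      Allocated[I] = true;
      return I;
    }
  }
  Allocated.push_back(true); // no reusable slot: create a fresh one
  return Allocated.size() - 1;
}
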
+
+/// Try to find existing copies of the incoming values in stack slots used for
+/// statepoint spilling.  If we can find a spill slot for the incoming value,
+/// mark that slot as allocated, and reuse the same slot for this safepoint.
+/// This helps to avoid a series of loads and stores that only serve to
+/// reshuffle values on the stack between calls.
+static void reservePreviousStackSlotForValue(SDValue Incoming,
+                                             SelectionDAGBuilder &Builder) {
+
+  if (isa<ConstantSDNode>(Incoming) || isa<FrameIndexSDNode>(Incoming)) {
+    // We won't need to spill this, so no need to check for previously
+    // allocated stack slots
+    return;
+  }
+
+  SDValue Loc = Builder.StatepointLowering.getLocation(Incoming);
+  if (Loc.getNode()) {
+    // duplicates in input
+    return;
+  }
+
+  // Search back for the load from a stack slot pattern to find the original
+  // slot we allocated for this value.  We could extend this to deal with
+  // simple modification patterns, but simply dealing with trivial load/store
+  // sequences helps a lot already.
+  if (LoadSDNode *Load = dyn_cast<LoadSDNode>(Incoming)) {
+    if (auto *FI = dyn_cast<FrameIndexSDNode>(Load->getBasePtr())) {
+      const int Index = FI->getIndex();
+      auto Itr = std::find(Builder.FuncInfo.StatepointStackSlots.begin(),
+                           Builder.FuncInfo.StatepointStackSlots.end(), Index);
+      if (Itr == Builder.FuncInfo.StatepointStackSlots.end()) {
+        // not one of the lowering stack slots, can't reuse!
+        // TODO: Actually, we probably could reuse the stack slot if the value
+        // hasn't changed at all, but we'd need to look for intervening writes
+        return;
+      } else {
+        // This is one of our dedicated lowering slots
+        const int Offset =
+            std::distance(Builder.FuncInfo.StatepointStackSlots.begin(), Itr);
+        if (Builder.StatepointLowering.isStackSlotAllocated(Offset)) {
+          // stack slot already assigned to someone else, can't use it!
+          // TODO: currently we reserve space for gc arguments after doing
+          // normal allocation for deopt arguments.  We should reserve for
+          // _all_ deopt and gc arguments, then start allocating.  This
+          // will prevent some moves being inserted when vm state changes,
+          // but gc state doesn't between two calls.
+          return;
+        }
+        // Reserve this stack slot
+        Builder.StatepointLowering.reserveStackSlot(Offset);
+      }
+
+      // Cache this slot so we find it when going through the normal
+      // assignment loop.
+      SDValue Loc =
+          Builder.DAG.getTargetFrameIndex(Index, Incoming.getValueType());
+
+      Builder.StatepointLowering.setLocation(Incoming, Loc);
+    }
+  }
+
+  // TODO: handle case where a reloaded value flows through a phi to
+  // another safepoint.  e.g.
+  // bb1:
+  //  a' = relocated...
+  // bb2: % pred: bb1, bb3, bb4, etc.
+  //  a_phi = phi(a', ...)
+  // statepoint ... a_phi
+  // NOTE: This will require reasoning about cross basic block values.  This is
+  // decidedly non trivial and this might not be the right place to do it.  We
+  // don't really have the information we need here...
+
+  // TODO: handle simple updates.  If a value is modified and the original
+  // value is no longer live, it would be nice to put the modified value in the
+  // same slot.  This allows folding of the memory accesses for some
+  // instruction types (like an increment).
+  // statepoint (i)
+  // i1 = i+1
+  // statepoint (i1)
+}
+
+/// Remove any duplicates (as SDValues) from the derived pointer pairs.  This
+/// is not required for correctness.  Its purpose is to reduce the size of the
+/// StackMap section.  It has no effect on the number of spill slots required
+/// or the actual lowering.
+static void removeDuplicatesGCPtrs(SmallVectorImpl<const Value *> &Bases,
+                                   SmallVectorImpl<const Value *> &Ptrs,
+                                   SmallVectorImpl<const Value *> &Relocs,
+                                   SelectionDAGBuilder &Builder) {
+
+  // This is horribly inefficient, but I don't care right now
+  SmallSet<SDValue, 64> Seen;
+
+  SmallVector<const Value *, 64> NewBases, NewPtrs, NewRelocs;
+  for (size_t i = 0; i < Ptrs.size(); i++) {
+    SDValue SD = Builder.getValue(Ptrs[i]);
+    // Only add non-duplicates
+    if (Seen.count(SD) == 0) {
+      NewBases.push_back(Bases[i]);
+      NewPtrs.push_back(Ptrs[i]);
+      NewRelocs.push_back(Relocs[i]);
+    }
+    Seen.insert(SD);
+  }
+  assert(Bases.size() >= NewBases.size());
+  assert(Ptrs.size() >= NewPtrs.size());
+  assert(Relocs.size() >= NewRelocs.size());
+  Bases = NewBases;
+  Ptrs = NewPtrs;
+  Relocs = NewRelocs;
+  assert(Ptrs.size() == Bases.size());
+  assert(Ptrs.size() == Relocs.size());
+}
+
+/// Extract call from statepoint, lower it and return pointer to the
+/// call node. Also update NodeMap so that getValue(statepoint) will
+/// reference lowered call result
+static SDNode *lowerCallFromStatepoint(ImmutableStatepoint StatepointSite,
+                                       SelectionDAGBuilder &Builder) {
+
+  ImmutableCallSite CS(StatepointSite.getCallSite());
+
+  // Lower the actual call itself - This is a bit of a hack, but we want to
+  // avoid modifying the actual lowering code.  This is similar in intent to
+  // the LowerCallOperands mechanism used by PATCHPOINT, but is structured
+  // differently.  Hopefully, this is slightly more robust w.r.t. calling
+  // convention, return values, and other function attributes.
+  Value *ActualCallee = const_cast<Value *>(StatepointSite.actualCallee());
+
+  std::vector<Value *> Args;
+  CallInst::const_op_iterator arg_begin = StatepointSite.call_args_begin();
+  CallInst::const_op_iterator arg_end = StatepointSite.call_args_end();
+  Args.insert(Args.end(), arg_begin, arg_end);
+  // TODO: remove the creation of a new instruction!  We should not be
+  // modifying the IR (even temporarily) at this point.
+  CallInst *Tmp = CallInst::Create(ActualCallee, Args);
+  Tmp->setTailCall(CS.isTailCall());
+  Tmp->setCallingConv(CS.getCallingConv());
+  Tmp->setAttributes(CS.getAttributes());
+  Builder.LowerCallTo(Tmp, Builder.getValue(ActualCallee), false);
+
+  // Handle the return value of the call, if any.
+  const bool HasDef = !Tmp->getType()->isVoidTy();
+  if (HasDef) {
+    // The value of the statepoint itself will be the value of the call itself.
+    // We'll replace the actual call node shortly.  gc_result will grab
+    // this value.
+    Builder.setValue(CS.getInstruction(), Builder.getValue(Tmp));
+  } else {
+    // The token value is never used from here on, just generate a poison value
+    Builder.setValue(CS.getInstruction(), Builder.DAG.getIntPtrConstant(-1));
+  }
+  // Remove the fake entry we created so we don't have a hanging reference
+  // after we delete this node.
+  Builder.removeValue(Tmp);
+  delete Tmp;
+  Tmp = nullptr;
+
+  // Search for the call node
+  // The following code is essentially reverse engineering X86's
+  // LowerCallTo.
+  SDNode *CallNode = nullptr;
+
+  // We just emitted a call, so it should be the last thing generated
+  SDValue Chain = Builder.DAG.getRoot();
+
+  // Find closest CALLSEQ_END walking back through lowered nodes if needed
+  SDNode *CallEnd = Chain.getNode();
+  int Sanity = 0;
+  while (CallEnd->getOpcode() != ISD::CALLSEQ_END) {
+    CallEnd = CallEnd->getGluedNode();
+    assert(CallEnd && "Can not find call node");
+    assert(Sanity < 20 && "should have found call end already");
+    Sanity++;
+  }
+  assert(CallEnd->getOpcode() == ISD::CALLSEQ_END &&
+         "Expected a callseq node.");
+  assert(CallEnd->getGluedNode());
+
+  // Step back inside the CALLSEQ
+  CallNode = CallEnd->getGluedNode();
+  return CallNode;
+}
+
+/// Collect all gc pointers coming into a statepoint intrinsic, clean them up,
+/// and return three arrays:
+///   Bases - base pointers incoming to this statepoint
+///   Ptrs - derived pointers incoming to this statepoint
+///   Relocs - the gc_relocate corresponding to each base/ptr pair
+/// Elements of these arrays should be in one-to-one correspondence with each
+/// other, i.e. Bases[i] and Ptrs[i] are from the same gc_relocate call
+static void
+getIncomingStatepointGCValues(SmallVectorImpl<const Value *> &Bases,
+                              SmallVectorImpl<const Value *> &Ptrs,
+                              SmallVectorImpl<const Value *> &Relocs,
+                              ImmutableStatepoint StatepointSite,
+                              SelectionDAGBuilder &Builder) {
+  for (GCRelocateOperands relocateOpers :
+         StatepointSite.getRelocates(StatepointSite)) {
+    Relocs.push_back(relocateOpers.getUnderlyingCallSite().getInstruction());
+    Bases.push_back(relocateOpers.basePtr());
+    Ptrs.push_back(relocateOpers.derivedPtr());
+  }
+
+  // Remove any redundant llvm::Values which map to the same SDValue as another
+  // input.  Also has the effect of removing duplicates in the original
+  // llvm::Value input list as well.  This is a useful optimization for
+  // reducing the size of the StackMap section.  It has no other impact.
+  removeDuplicatesGCPtrs(Bases, Ptrs, Relocs, Builder);
+
+  assert(Bases.size() == Ptrs.size() && Ptrs.size() == Relocs.size());
+}
+
+/// Spill a value incoming to the statepoint. It might be either part of the
+/// vmstate or the gcstate. In both cases unconditionally spill it on the
+/// stack unless it is a null constant. Return a pair whose first element is
+/// the frame index containing the saved value and whose second element is
+/// the outgoing chain from the emitted store.
+static std::pair<SDValue, SDValue>
+spillIncomingStatepointValue(SDValue Incoming, SDValue Chain,
+                             SelectionDAGBuilder &Builder) {
+  SDValue Loc = Builder.StatepointLowering.getLocation(Incoming);
+
+  // Emit new store if we didn't do it for this ptr before
+  if (!Loc.getNode()) {
+    Loc = Builder.StatepointLowering.allocateStackSlot(Incoming.getValueType(),
+                                                       Builder);
+    assert(isa<FrameIndexSDNode>(Loc));
+    int Index = cast<FrameIndexSDNode>(Loc)->getIndex();
+    // We use TargetFrameIndex so that isel will not select it into LEA
+    Loc = Builder.DAG.getTargetFrameIndex(Index, Incoming.getValueType());
+
+    // TODO: We can create a TokenFactor node instead of
+    //       chaining stores one after another; this may allow
+    //       slightly better scheduling for them.
+    Chain = Builder.DAG.getStore(Chain, Builder.getCurSDLoc(), Incoming, Loc,
+                                 MachinePointerInfo::getFixedStack(Index),
+                                 false, false, 0);
+
+    Builder.StatepointLowering.setLocation(Incoming, Loc);
+  }
+
+  assert(Loc.getNode());
+  return std::make_pair(Loc, Chain);
+}
+
+/// Lower a single value incoming to a statepoint node.  This value can be
+/// either a deopt value or a gc value, the handling is the same.  We special
+/// case constants and allocas, then fall back to spilling if required.
+static void lowerIncomingStatepointValue(SDValue Incoming,
+                                         SmallVectorImpl<SDValue> &Ops,
+                                         SelectionDAGBuilder &Builder) {
+  SDValue Chain = Builder.getRoot();
+
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Incoming)) {
+    // If the original value was a constant, make sure it gets recorded as
+    // such in the stackmap.  This is required so that the consumer can
+    // parse any internal format to the deopt state.  It also handles null
+    // pointers and other constant pointers in GC states
+    Ops.push_back(
+        Builder.DAG.getTargetConstant(StackMaps::ConstantOp, MVT::i64));
+    Ops.push_back(Builder.DAG.getTargetConstant(C->getSExtValue(), MVT::i64));
+  } else if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Incoming)) {
+    // This handles allocas as arguments to the statepoint
+    const TargetLowering &TLI = Builder.DAG.getTargetLoweringInfo();
+    Ops.push_back(
+        Builder.DAG.getTargetFrameIndex(FI->getIndex(), TLI.getPointerTy()));
+  } else {
+    // Otherwise, locate a spill slot and explicitly spill it so it
+    // can be found by the runtime later.  We currently do not support
+    // tracking values through callee saved registers to their eventual
+    // spill location.  This would be a useful optimization, but would
+    // need to be optional since it requires a lot of complexity on the
+    // runtime side which not all would support.
+    std::pair<SDValue, SDValue> Res =
+        spillIncomingStatepointValue(Incoming, Chain, Builder);
+    Ops.push_back(Res.first);
+    Chain = Res.second;
+  }
+
+  Builder.DAG.setRoot(Chain);
+}
+
+/// Lower deopt state and gc pointer arguments of the statepoint.  The actual
+/// lowering is described in lowerIncomingStatepointValue.  This function is
+/// responsible for lowering everything in the right position and playing some
+/// tricks to avoid redundant stack manipulation where possible.  On
+/// completion, 'Ops' will contain ready to use operands for machine code
+/// statepoint. The chain nodes will have already been created and the DAG root
+/// will be set to the last value spilled (if any were).
+static void lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
+                                    ImmutableStatepoint StatepointSite,
+                                    SelectionDAGBuilder &Builder) {
+
+  // Lower the deopt and gc arguments for this statepoint.  Layout will
+  // be: deopt argument length, deopt arguments.., gc arguments...
+
+  SmallVector<const Value *, 64> Bases, Ptrs, Relocations;
+  getIncomingStatepointGCValues(Bases, Ptrs, Relocations,
+                                StatepointSite, Builder);
+
+#ifndef NDEBUG
+  // Check that each of the gc pointer and bases we've gotten out of the
+  // safepoint is something the strategy thinks might be a pointer into the GC
+  // heap.  This is basically just here to help catch errors during statepoint
+  // insertion. TODO: This should actually be in the Verifier, but we can't get
+  // to the GCStrategy from there (yet).
+  if (Builder.GFI) {
+    GCStrategy &S = Builder.GFI->getStrategy();
+    for (const Value *V : Bases) {
+      auto Opt = S.isGCManagedPointer(V);
+      if (Opt.hasValue()) {
+        assert(Opt.getValue() &&
+               "non gc managed base pointer found in statepoint");
+      }
+    }
+    for (const Value *V : Ptrs) {
+      auto Opt = S.isGCManagedPointer(V);
+      if (Opt.hasValue()) {
+        assert(Opt.getValue() &&
+               "non gc managed derived pointer found in statepoint");
+      }
+    }
+    for (const Value *V : Relocations) {
+      auto Opt = S.isGCManagedPointer(V);
+      if (Opt.hasValue()) {
+        assert(Opt.getValue() && "non gc managed pointer relocated");
+      }
+    }
+  }
+#endif
+
+  // Before we actually start lowering (and allocating spill slots for values),
+  // reserve any stack slots which we judge to be profitable to reuse for a
+  // particular value.  This is purely an optimization over the code below and
+  // doesn't change semantics at all.  It is important for performance that we
+  // reserve slots for both deopt and gc values before lowering either.
+  for (auto I = StatepointSite.vm_state_begin() + 1,
+            E = StatepointSite.vm_state_end();
+       I != E; ++I) {
+    Value *V = *I;
+    SDValue Incoming = Builder.getValue(V);
+    reservePreviousStackSlotForValue(Incoming, Builder);
+  }
+  for (unsigned i = 0; i < Bases.size() * 2; ++i) {
+    // Even elements will contain base, odd elements - derived ptr
+    const Value *V = i % 2 ? Bases[i / 2] : Ptrs[i / 2];
+    SDValue Incoming = Builder.getValue(V);
+    reservePreviousStackSlotForValue(Incoming, Builder);
+  }
+
+  // First, prefix the list with the number of unique values to be
+  // lowered.  Note that this is the number of *Values* not the
+  // number of SDValues required to lower them.
+  const int NumVMSArgs = StatepointSite.numTotalVMSArgs();
+  Ops.push_back(
+      Builder.DAG.getTargetConstant(StackMaps::ConstantOp, MVT::i64));
+  Ops.push_back(Builder.DAG.getTargetConstant(NumVMSArgs, MVT::i64));
+
+  assert(NumVMSArgs + 1 == std::distance(StatepointSite.vm_state_begin(),
+                                         StatepointSite.vm_state_end()));
+
+  // The vm state arguments are lowered in an opaque manner.  We do
+  // not know what type of values are contained within.  We skip the
+  // first one since that happens to be the total number we lowered
+  // explicitly just above.  We could have left it in the loop and
+  // not done it explicitly, but it's far easier to understand this
+  // way.
+  for (auto I = StatepointSite.vm_state_begin() + 1,
+            E = StatepointSite.vm_state_end();
+       I != E; ++I) {
+    const Value *V = *I;
+    SDValue Incoming = Builder.getValue(V);
+    lowerIncomingStatepointValue(Incoming, Ops, Builder);
+  }
+
+  // Finally, go ahead and lower all the gc arguments.  There's no prefixed
+  // length for this one.  After lowering, we'll have the base and pointer
+  // arrays interwoven with each (lowered) base pointer immediately followed by
+  // its (lowered) derived pointer, i.e.
+  // (base[0], ptr[0], base[1], ptr[1], ...)
+  for (unsigned i = 0; i < Bases.size() * 2; ++i) {
+    // Even elements will contain base, odd elements - derived ptr
+    const Value *V = i % 2 ? Bases[i / 2] : Ptrs[i / 2];
+    SDValue Incoming = Builder.getValue(V);
+    lowerIncomingStatepointValue(Incoming, Ops, Builder);
+  }
+}
+
+void SelectionDAGBuilder::visitStatepoint(const CallInst &CI) {
+  // Check some preconditions for sanity
+  assert(isStatepoint(&CI) &&
+         "function called must be the statepoint function");
+
+  LowerStatepoint(ImmutableStatepoint(&CI));
+}
+
+void SelectionDAGBuilder::LowerStatepoint(ImmutableStatepoint ISP) {
+  // The basic scheme here is that information about both the original call and
+  // the safepoint is encoded in the CallInst.  We create a temporary call and
+  // lower it, then reverse engineer the calling sequence.
+
+  NumOfStatepoints++;
+  // Clear state
+  StatepointLowering.startNewStatepoint(*this);
+
+  ImmutableCallSite CS(ISP.getCallSite());
+
+#ifndef NDEBUG
+  // Consistency check
+  for (const User *U : CS->users()) {
+    const CallInst *Call = cast<CallInst>(U);
+    if (isGCRelocate(Call))
+      StatepointLowering.scheduleRelocCall(*Call);
+  }
+#endif
+
+#ifndef NDEBUG
+  // If this is a malformed statepoint, report it early to simplify debugging.
+  // This should catch any IR level mistake that's made when constructing or
+  // transforming statepoints.
+  ISP.verify();
+
+  // Check that the associated GCStrategy expects to encounter statepoints.
+  // TODO: This if should become an assert.  For now, we allow the GCStrategy
+  // to be optional for backwards compatibility.  This will only last a short
+  // period (i.e. a couple of weeks).
+  if (GFI) {
+    assert(GFI->getStrategy().useStatepoints() &&
+           "GCStrategy does not expect to encounter statepoints");
+  }
+#endif
+
+  // Lower statepoint vmstate and gcstate arguments
+  SmallVector<SDValue, 10> LoweredArgs;
+  lowerStatepointMetaArgs(LoweredArgs, ISP, *this);
+
+  // Get call node, we will replace it later with statepoint
+  SDNode *CallNode = lowerCallFromStatepoint(ISP, *this);
+
+  // Construct the actual STATEPOINT node with all the appropriate arguments
+  // and return values.
+
+  // TODO: Currently, all of these operands are being marked as read/write in
+  // PrologEpilogInserter.cpp, we should special case the VMState arguments
+  // and flags to be read-only.
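+
+  // An informal sketch (for orientation only) of the operand order assembled
+  // below:
+  //   <#call args>, <call target>, <call args...>,
+  //   ConstantOp, <flags | (calling conv << 1)>,
+  //   <lowered vmstate and gcstate arguments...>,
+  //   <register mask>, <chain>[, <glue>]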
+  SmallVector<SDValue, 40> Ops;
+
+  // Calculate and push starting position of vmstate arguments
+  // Call Node: Chain, Target, {Args}, RegMask, [Glue]
+  SDValue Glue;
+  if (CallNode->getGluedNode()) {
+    // Glue is always last operand
+    Glue = CallNode->getOperand(CallNode->getNumOperands() - 1);
+  }
+  // Get number of arguments incoming directly into call node
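+  // (i.e. total operands minus Chain, Target and RegMask, minus Glue when
+  // present)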
+  unsigned NumCallRegArgs =
+      CallNode->getNumOperands() - (Glue.getNode() ? 4 : 3);
+  Ops.push_back(DAG.getTargetConstant(NumCallRegArgs, MVT::i32));
+
+  // Add call target
+  SDValue CallTarget = SDValue(CallNode->getOperand(1).getNode(), 0);
+  Ops.push_back(CallTarget);
+
+  // Add call arguments
+  // Get position of register mask in the call
+  SDNode::op_iterator RegMaskIt;
+  if (Glue.getNode())
+    RegMaskIt = CallNode->op_end() - 2;
+  else
+    RegMaskIt = CallNode->op_end() - 1;
+  Ops.insert(Ops.end(), CallNode->op_begin() + 2, RegMaskIt);
+
+  // Add a leading constant argument with the Flags and the calling convention
+  // packed together (the calling convention is shifted left by one bit).
+  CallingConv::ID CallConv = CS.getCallingConv();
+  int Flags = dyn_cast<ConstantInt>(CS.getArgument(2))->getZExtValue();
+  assert(Flags == 0 && "not expected to be used");
+  Ops.push_back(DAG.getTargetConstant(StackMaps::ConstantOp, MVT::i64));
+  Ops.push_back(
+      DAG.getTargetConstant(Flags | ((unsigned)CallConv << 1), MVT::i64));
+
+  // Insert all vmstate and gcstate arguments
+  Ops.insert(Ops.end(), LoweredArgs.begin(), LoweredArgs.end());
+
+  // Add register mask from call node
+  Ops.push_back(*RegMaskIt);
+
+  // Add chain
+  Ops.push_back(CallNode->getOperand(0));
+
+  // Same for the glue, but we add it only if the original call had it
+  if (Glue.getNode())
+    Ops.push_back(Glue);
+
+  // Compute return values.  Provide a glue output since we consume one as
+  // input.  This allows someone else to chain off us as needed.
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+  SDNode *StatepointMCNode = DAG.getMachineNode(TargetOpcode::STATEPOINT,
+                                                getCurSDLoc(), NodeTys, Ops);
+
+  // Replace original call
+  DAG.ReplaceAllUsesWith(CallNode, StatepointMCNode); // This may update Root
+  // Remove the original call node
+  DAG.DeleteNode(CallNode);
+
+  // DON'T set the root - under the assumption that it's already set past the
+  // inserted node we created.
+
+  // TODO: A better future implementation would be to emit a single variable
+  // argument, variable return value STATEPOINT node here and then hookup the
+  // return value of each gc.relocate to the respective output of the
+  // previously emitted STATEPOINT value.  Unfortunately, this doesn't appear
+  // to actually be possible today.
+}
+
+void SelectionDAGBuilder::visitGCResult(const CallInst &CI) {
+  // The result value of the gc_result is simply the result of the actual
+  // call.  We've already emitted this, so just grab the value.
+  Instruction *I = cast<Instruction>(CI.getArgOperand(0));
+  assert(isStatepoint(I) &&
+         "first argument must be a statepoint token");
+
+  setValue(&CI, getValue(I));
+}
+
+void SelectionDAGBuilder::visitGCRelocate(const CallInst &CI) {
+#ifndef NDEBUG
+  // Consistency check
+  StatepointLowering.relocCallVisited(CI);
+#endif
+
+  GCRelocateOperands relocateOpers(&CI);
+  SDValue SD = getValue(relocateOpers.derivedPtr());
+
+  if (isa<ConstantSDNode>(SD) || isa<FrameIndexSDNode>(SD)) {
+    // We didn't need to spill these special cases (constants and allocas).
+    // See the handling in spillIncomingValueForStatepoint for details.
+    setValue(&CI, SD);
+    return;
+  }
+
+  SDValue Loc = StatepointLowering.getRelocLocation(SD);
+  // Emit new load if we did not emit it before
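+  // (the reload is cached via setRelocLocation below, so later gc.relocates
+  // of the same value reuse it)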
+  if (!Loc.getNode()) {
+    SDValue SpillSlot = StatepointLowering.getLocation(SD);
+    int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
+
+    // Be conservative: flush all pending loads
+    // TODO: Probably we can be less restrictive on this,
+    // it may allow more scheduling opportunities.
+    SDValue Chain = getRoot();
+
+    Loc = DAG.getLoad(SpillSlot.getValueType(), getCurSDLoc(), Chain,
+                      SpillSlot, MachinePointerInfo::getFixedStack(FI), false,
+                      false, false, 0);
+
+    StatepointLowering.setRelocLocation(SD, Loc);
+
+    // Again, be conservative, don't emit pending loads
+    DAG.setRoot(Loc.getValue(1));
+  }
+
+  assert(Loc.getNode());
+  setValue(&CI, Loc);
+}
diff --git a/lib/CodeGen/SelectionDAG/StatepointLowering.h b/lib/CodeGen/SelectionDAG/StatepointLowering.h
new file mode 100644
index 0000000..673112c
--- /dev/null
+++ b/lib/CodeGen/SelectionDAG/StatepointLowering.h
@@ -0,0 +1,138 @@
+//===-- StatepointLowering.h - SDAGBuilder's statepoint code -*- C++ -*---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes support code used by SelectionDAGBuilder when lowering a
+// statepoint sequence in SelectionDAG IR.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_STATEPOINTLOWERING_H
+#define LLVM_LIB_CODEGEN_SELECTIONDAG_STATEPOINTLOWERING_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include <vector>
+
+namespace llvm {
+class SelectionDAGBuilder;
+
+/// This class tracks both per-statepoint and per-selectiondag information.
+/// For each statepoint it tracks locations of its gc values (incoming and
+/// relocated) and list of gcreloc calls scheduled for visiting (this is
+/// used for a debug mode consistency check only).  The spill slot tracking
+/// works in concert with information in FunctionLoweringInfo.
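+///
+/// Rough usage sketch (inferred from the interface below, not a contract):
+/// startNewStatepoint() resets the per-statepoint state, setLocation() and
+/// getLocation() record and query spill slots while lowering the statepoint,
+/// setRelocLocation() and getRelocLocation() do the same for values reloaded
+/// by gc.relocate, and clear() releases everything (it is called from
+/// SelectionDAGBuilder::clear).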
+class StatepointLoweringState {
+public:
+  StatepointLoweringState() : NextSlotToAllocate(0) {}
+
+  /// Reset all state tracking for a newly encountered safepoint.  Also
+  /// performs some consistency checking.
+  void startNewStatepoint(SelectionDAGBuilder &Builder);
+
+  /// Clear the memory usage of this object.  This is called from
+  /// SelectionDAGBuilder::clear.  We require that this is never called in the
+  /// midst of processing a statepoint sequence.
+  void clear();
+
+  /// Returns the spill location of a value incoming to the current
+  /// statepoint.  Will return SDValue() if this value hasn't been
+  /// spilled.  Otherwise, the value has already been spilled and no
+  /// further action is required by the caller.
+  SDValue getLocation(SDValue val) {
+    if (!Locations.count(val))
+      return SDValue();
+    return Locations[val];
+  }
+  void setLocation(SDValue val, SDValue Location) {
+    assert(!Locations.count(val) &&
+           "Trying to allocate already allocated location");
+    Locations[val] = Location;
+  }
+
+  /// Returns the relocated value for a given input pointer. Will
+  /// return SDValue() if this value hasn't yet been reloaded from
+/// its stack slot after the statepoint.  Otherwise, the value
+  /// has already been reloaded and the SDValue of that reload will
+  /// be returned. Note that VMState values are spilled but not
+  /// reloaded (since they don't change at the safepoint unless
+  /// also listed in the GC pointer section) and will thus never
+/// be in this map.
+  SDValue getRelocLocation(SDValue val) {
+    if (!RelocLocations.count(val))
+      return SDValue();
+    return RelocLocations[val];
+  }
+  void setRelocLocation(SDValue val, SDValue Location) {
+    assert(!RelocLocations.count(val) &&
+           "Trying to allocate already allocated location");
+    RelocLocations[val] = Location;
+  }
+
+  /// Record the fact that we expect to encounter a given gc_relocate
+  /// before the next statepoint.  If we don't see it, we'll report
+  /// an assertion.
+  void scheduleRelocCall(const CallInst &RelocCall) {
+    PendingGCRelocateCalls.push_back(&RelocCall);
+  }
+  /// Remove this gc_relocate from the list we're expecting to see
+  /// before the next statepoint.  If we weren't expecting to see
+  /// it, we'll report an assertion.
+  void relocCallVisited(const CallInst &RelocCall) {
+    SmallVectorImpl<const CallInst *>::iterator itr =
+        std::find(PendingGCRelocateCalls.begin(), PendingGCRelocateCalls.end(),
+                  &RelocCall);
+    assert(itr != PendingGCRelocateCalls.end() &&
+           "Visited unexpected gcrelocate call");
+    PendingGCRelocateCalls.erase(itr);
+  }
+
+  // TODO: Should add consistency tracking to ensure we encounter
+  // expected gc_result calls too.
+
+  /// Get a stack slot we can use to store a value of type ValueType.  This
+  /// will hopefully be a recycled slot from another statepoint.
+  SDValue allocateStackSlot(EVT ValueType, SelectionDAGBuilder &Builder);
+
+  void reserveStackSlot(int Offset) {
+    assert(Offset >= 0 && Offset < (int)AllocatedStackSlots.size() &&
+           "out of bounds");
+    assert(!AllocatedStackSlots[Offset] && "already reserved!");
+    assert(NextSlotToAllocate <= (unsigned)Offset && "consistency!");
+    AllocatedStackSlots[Offset] = true;
+  }
+  bool isStackSlotAllocated(int Offset) {
+    assert(Offset >= 0 && Offset < (int)AllocatedStackSlots.size() &&
+           "out of bounds");
+    return AllocatedStackSlots[Offset];
+  }
+
+private:
+  /// Maps a pre-relocation value (gc pointer directly incoming into the
+  /// statepoint) into its location (currently only stack slots).
+  DenseMap<SDValue, SDValue> Locations;
+  /// Maps a pre-relocated value into its new relocated location.
+  DenseMap<SDValue, SDValue> RelocLocations;
+
+  /// A boolean indicator for each slot listed in the FunctionInfo as to
+  /// whether it has been used in the current statepoint.  Since we try to
+  /// preserve stack slots across safepoints, there can be gaps in which
+  /// slots have been allocated.
+  SmallVector<bool, 50> AllocatedStackSlots;
+
+  /// Points just beyond the last slot known to have been allocated
+  unsigned NextSlotToAllocate;
+
+  /// Keep track of pending gcrelocate calls for consistency check
+  SmallVector<const CallInst *, 10> PendingGCRelocateCalls;
+};
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_SELECTIONDAG_STATEPOINTLOWERING_H
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 9aef5ed..0a3c926 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -793,19 +793,26 @@
 
     APInt MsbMask = APInt::getHighBitsSet(BitWidth, 1);
     // If we only care about the highest bit, don't bother shifting right.
-    if (MsbMask == DemandedMask) {
+    if (MsbMask == NewMask) {
       unsigned ShAmt = ExVT.getScalarType().getSizeInBits();
       SDValue InOp = Op.getOperand(0);
+      unsigned VTBits = Op->getValueType(0).getScalarType().getSizeInBits();
+      bool AlreadySignExtended =
+        TLO.DAG.ComputeNumSignBits(InOp) >= VTBits-ShAmt+1;
+      // However, if the input is already sign extended, we expect the sign
+      // extension to be dropped altogether later and do not simplify.
+      if (!AlreadySignExtended) {
+        // Compute the correct shift amount type, which must be getShiftAmountTy
+        // for scalar types after legalization.
+        EVT ShiftAmtTy = Op.getValueType();
+        if (TLO.LegalTypes() && !ShiftAmtTy.isVector())
+          ShiftAmtTy = getShiftAmountTy(ShiftAmtTy);
 
-      // Compute the correct shift amount type, which must be getShiftAmountTy
-      // for scalar types after legalization.
-      EVT ShiftAmtTy = Op.getValueType();
-      if (TLO.LegalTypes() && !ShiftAmtTy.isVector())
-        ShiftAmtTy = getShiftAmountTy(ShiftAmtTy);
-
-      SDValue ShiftAmt = TLO.DAG.getConstant(BitWidth - ShAmt, ShiftAmtTy);
-      return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl,
-                                            Op.getValueType(), InOp, ShiftAmt));
+        SDValue ShiftAmt = TLO.DAG.getConstant(BitWidth - ShAmt, ShiftAmtTy);
+        return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl,
+                                                 Op.getValueType(), InOp,
+                                                 ShiftAmt));
+      }
     }
 
     // Sign extension.  Compute the demanded bits in the result that are not
@@ -1283,36 +1290,53 @@
     }
 
     // (zext x) == C --> x == (trunc C)
-    if (DCI.isBeforeLegalize() && N0->hasOneUse() &&
-        (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
+    // (sext x) == C --> x == (trunc C)
+    if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+        DCI.isBeforeLegalize() && N0->hasOneUse()) {
       unsigned MinBits = N0.getValueSizeInBits();
-      SDValue PreZExt;
+      SDValue PreExt;
+      bool Signed = false;
       if (N0->getOpcode() == ISD::ZERO_EXTEND) {
         // ZExt
         MinBits = N0->getOperand(0).getValueSizeInBits();
-        PreZExt = N0->getOperand(0);
+        PreExt = N0->getOperand(0);
       } else if (N0->getOpcode() == ISD::AND) {
         // DAGCombine turns costly ZExts into ANDs
         if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0->getOperand(1)))
           if ((C->getAPIntValue()+1).isPowerOf2()) {
             MinBits = C->getAPIntValue().countTrailingOnes();
-            PreZExt = N0->getOperand(0);
+            PreExt = N0->getOperand(0);
           }
+      } else if (N0->getOpcode() == ISD::SIGN_EXTEND) {
+        // SExt
+        MinBits = N0->getOperand(0).getValueSizeInBits();
+        PreExt = N0->getOperand(0);
+        Signed = true;
       } else if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(N0)) {
-        // ZEXTLOAD
+        // ZEXTLOAD / SEXTLOAD
         if (LN0->getExtensionType() == ISD::ZEXTLOAD) {
           MinBits = LN0->getMemoryVT().getSizeInBits();
-          PreZExt = N0;
+          PreExt = N0;
+        } else if (LN0->getExtensionType() == ISD::SEXTLOAD) {
+          Signed = true;
+          MinBits = LN0->getMemoryVT().getSizeInBits();
+          PreExt = N0;
         }
       }
 
+      // Figure out how many bits we need to preserve this constant.
+      unsigned ReqdBits = Signed ?
+        C1.getBitWidth() - C1.getNumSignBits() + 1 :
+        C1.getActiveBits();
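+      // (For example, under this rule an all-ones constant such as -1 has
+      // every bit equal to its sign bit and so needs only a single bit.)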
+
       // Make sure we're not losing bits from the constant.
       if (MinBits > 0 &&
-          MinBits < C1.getBitWidth() && MinBits >= C1.getActiveBits()) {
+          MinBits < C1.getBitWidth() &&
+          MinBits >= ReqdBits) {
         EVT MinVT = EVT::getIntegerVT(*DAG.getContext(), MinBits);
         if (isTypeDesirableForOp(ISD::SETCC, MinVT)) {
           // Will get folded away.
-          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, MinVT, PreZExt);
+          SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, MinVT, PreExt);
           SDValue C = DAG.getConstant(C1.trunc(MinBits), MinVT);
           return DAG.getSetCC(dl, VT, Trunc, C, Cond);
         }
@@ -2163,9 +2187,10 @@
   }
 }
 
-std::pair<unsigned, const TargetRegisterClass*> TargetLowering::
-getRegForInlineAsmConstraint(const std::string &Constraint,
-                             MVT VT) const {
+std::pair<unsigned, const TargetRegisterClass *>
+TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *RI,
+                                             const std::string &Constraint,
+                                             MVT VT) const {
   if (Constraint.empty() || Constraint[0] != '{')
     return std::make_pair(0u, static_cast<TargetRegisterClass*>(nullptr));
   assert(*(Constraint.end()-1) == '}' && "Not a brace enclosed constraint?");
@@ -2177,8 +2202,6 @@
     std::make_pair(0u, static_cast<const TargetRegisterClass*>(nullptr));
 
   // Figure out which register class contains this reg.
-  const TargetRegisterInfo *RI =
-      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
   for (TargetRegisterInfo::regclass_iterator RCI = RI->regclass_begin(),
        E = RI->regclass_end(); RCI != E; ++RCI) {
     const TargetRegisterClass *RC = *RCI;
@@ -2231,8 +2254,9 @@
 /// and also tie in the associated operand values.
 /// If this returns an empty vector, and if the constraint string itself
 /// isn't empty, there was an error parsing.
-TargetLowering::AsmOperandInfoVector TargetLowering::ParseConstraints(
-    ImmutableCallSite CS) const {
+TargetLowering::AsmOperandInfoVector
+TargetLowering::ParseConstraints(const TargetRegisterInfo *TRI,
+                                 ImmutableCallSite CS) const {
   /// ConstraintOperands - Information about all of the constraints.
   AsmOperandInfoVector ConstraintOperands;
   const InlineAsm *IA = cast<InlineAsm>(CS.getCalledValue());
@@ -2323,7 +2347,7 @@
   }
 
   // If we have multiple alternative constraints, select the best alternative.
-  if (ConstraintOperands.size()) {
+  if (!ConstraintOperands.empty()) {
     if (maCount) {
       unsigned bestMAIndex = 0;
       int bestWeight = -1;
@@ -2394,12 +2418,12 @@
       AsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput];
 
       if (OpInfo.ConstraintVT != Input.ConstraintVT) {
-        std::pair<unsigned, const TargetRegisterClass*> MatchRC =
-          getRegForInlineAsmConstraint(OpInfo.ConstraintCode,
-                                       OpInfo.ConstraintVT);
-        std::pair<unsigned, const TargetRegisterClass*> InputRC =
-          getRegForInlineAsmConstraint(Input.ConstraintCode,
-                                       Input.ConstraintVT);
+        std::pair<unsigned, const TargetRegisterClass *> MatchRC =
+            getRegForInlineAsmConstraint(TRI, OpInfo.ConstraintCode,
+                                         OpInfo.ConstraintVT);
+        std::pair<unsigned, const TargetRegisterClass *> InputRC =
+            getRegForInlineAsmConstraint(TRI, Input.ConstraintCode,
+                                         Input.ConstraintVT);
         if ((OpInfo.ConstraintVT.isInteger() !=
              Input.ConstraintVT.isInteger()) ||
             (MatchRC.second != InputRC.second)) {
diff --git a/lib/CodeGen/ShadowStackGC.cpp b/lib/CodeGen/ShadowStackGC.cpp
index 0be00f0..b12e943 100644
--- a/lib/CodeGen/ShadowStackGC.cpp
+++ b/lib/CodeGen/ShadowStackGC.cpp
@@ -38,416 +38,18 @@
 #define DEBUG_TYPE "shadowstackgc"
 
 namespace {
-
-  class ShadowStackGC : public GCStrategy {
-    /// RootChain - This is the global linked-list that contains the chain of GC
-    /// roots.
-    GlobalVariable *Head;
-
-    /// StackEntryTy - Abstract type of a link in the shadow stack.
-    ///
-    StructType *StackEntryTy;
-    StructType *FrameMapTy;
-
-    /// Roots - GC roots in the current function. Each is a pair of the
-    /// intrinsic call and its corresponding alloca.
-    std::vector<std::pair<CallInst*,AllocaInst*> > Roots;
-
-  public:
-    ShadowStackGC();
-
-    bool initializeCustomLowering(Module &M) override;
-    bool performCustomLowering(Function &F) override;
-
-  private:
-    bool IsNullValue(Value *V);
-    Constant *GetFrameMap(Function &F);
-    Type* GetConcreteStackEntryType(Function &F);
-    void CollectRoots(Function &F);
-    static GetElementPtrInst *CreateGEP(LLVMContext &Context, 
-                                        IRBuilder<> &B, Value *BasePtr,
-                                        int Idx1, const char *Name);
-    static GetElementPtrInst *CreateGEP(LLVMContext &Context,
-                                        IRBuilder<> &B, Value *BasePtr,
-                                        int Idx1, int Idx2, const char *Name);
-  };
-
+class ShadowStackGC : public GCStrategy {
+public:
+  ShadowStackGC();
+};
 }
 
 static GCRegistry::Add<ShadowStackGC>
-X("shadow-stack", "Very portable GC for uncooperative code generators");
+    X("shadow-stack", "Very portable GC for uncooperative code generators");
 
-namespace {
-  /// EscapeEnumerator - This is a little algorithm to find all escape points
-  /// from a function so that "finally"-style code can be inserted. In addition
-  /// to finding the existing return and unwind instructions, it also (if
-  /// necessary) transforms any call instructions into invokes and sends them to
-  /// a landing pad.
-  ///
-  /// It's wrapped up in a state machine using the same transform C# uses for
-  /// 'yield return' enumerators, This transform allows it to be non-allocating.
-  class EscapeEnumerator {
-    Function &F;
-    const char *CleanupBBName;
+void llvm::linkShadowStackGC() {}
 
-    // State.
-    int State;
-    Function::iterator StateBB, StateE;
-    IRBuilder<> Builder;
-
-  public:
-    EscapeEnumerator(Function &F, const char *N = "cleanup")
-      : F(F), CleanupBBName(N), State(0), Builder(F.getContext()) {}
-
-    IRBuilder<> *Next() {
-      switch (State) {
-      default:
-        return nullptr;
-
-      case 0:
-        StateBB = F.begin();
-        StateE = F.end();
-        State = 1;
-
-      case 1:
-        // Find all 'return', 'resume', and 'unwind' instructions.
-        while (StateBB != StateE) {
-          BasicBlock *CurBB = StateBB++;
-
-          // Branches and invokes do not escape, only unwind, resume, and return
-          // do.
-          TerminatorInst *TI = CurBB->getTerminator();
-          if (!isa<ReturnInst>(TI) && !isa<ResumeInst>(TI))
-            continue;
-
-          Builder.SetInsertPoint(TI->getParent(), TI);
-          return &Builder;
-        }
-
-        State = 2;
-
-        // Find all 'call' instructions.
-        SmallVector<Instruction*,16> Calls;
-        for (Function::iterator BB = F.begin(),
-                                E = F.end(); BB != E; ++BB)
-          for (BasicBlock::iterator II = BB->begin(),
-                                    EE = BB->end(); II != EE; ++II)
-            if (CallInst *CI = dyn_cast<CallInst>(II))
-              if (!CI->getCalledFunction() ||
-                  !CI->getCalledFunction()->getIntrinsicID())
-                Calls.push_back(CI);
-
-        if (Calls.empty())
-          return nullptr;
-
-        // Create a cleanup block.
-        LLVMContext &C = F.getContext();
-        BasicBlock *CleanupBB = BasicBlock::Create(C, CleanupBBName, &F);
-        Type *ExnTy = StructType::get(Type::getInt8PtrTy(C),
-                                      Type::getInt32Ty(C), nullptr);
-        Constant *PersFn =
-          F.getParent()->
-          getOrInsertFunction("__gcc_personality_v0",
-                              FunctionType::get(Type::getInt32Ty(C), true));
-        LandingPadInst *LPad = LandingPadInst::Create(ExnTy, PersFn, 1,
-                                                      "cleanup.lpad",
-                                                      CleanupBB);
-        LPad->setCleanup(true);
-        ResumeInst *RI = ResumeInst::Create(LPad, CleanupBB);
-
-        // Transform the 'call' instructions into 'invoke's branching to the
-        // cleanup block. Go in reverse order to make prettier BB names.
-        SmallVector<Value*,16> Args;
-        for (unsigned I = Calls.size(); I != 0; ) {
-          CallInst *CI = cast<CallInst>(Calls[--I]);
-
-          // Split the basic block containing the function call.
-          BasicBlock *CallBB = CI->getParent();
-          BasicBlock *NewBB =
-            CallBB->splitBasicBlock(CI, CallBB->getName() + ".cont");
-
-          // Remove the unconditional branch inserted at the end of CallBB.
-          CallBB->getInstList().pop_back();
-          NewBB->getInstList().remove(CI);
-
-          // Create a new invoke instruction.
-          Args.clear();
-          CallSite CS(CI);
-          Args.append(CS.arg_begin(), CS.arg_end());
-
-          InvokeInst *II = InvokeInst::Create(CI->getCalledValue(),
-                                              NewBB, CleanupBB,
-                                              Args, CI->getName(), CallBB);
-          II->setCallingConv(CI->getCallingConv());
-          II->setAttributes(CI->getAttributes());
-          CI->replaceAllUsesWith(II);
-          delete CI;
-        }
-
-        Builder.SetInsertPoint(RI->getParent(), RI);
-        return &Builder;
-      }
-    }
-  };
-}
-
-// -----------------------------------------------------------------------------
-
-void llvm::linkShadowStackGC() { }
-
-ShadowStackGC::ShadowStackGC() : Head(nullptr), StackEntryTy(nullptr) {
+ShadowStackGC::ShadowStackGC() {
   InitRoots = true;
   CustomRoots = true;
 }
-
-Constant *ShadowStackGC::GetFrameMap(Function &F) {
-  // doInitialization creates the abstract type of this value.
-  Type *VoidPtr = Type::getInt8PtrTy(F.getContext());
-
-  // Truncate the ShadowStackDescriptor if some metadata is null.
-  unsigned NumMeta = 0;
-  SmallVector<Constant*, 16> Metadata;
-  for (unsigned I = 0; I != Roots.size(); ++I) {
-    Constant *C = cast<Constant>(Roots[I].first->getArgOperand(1));
-    if (!C->isNullValue())
-      NumMeta = I + 1;
-    Metadata.push_back(ConstantExpr::getBitCast(C, VoidPtr));
-  }
-  Metadata.resize(NumMeta);
-
-  Type *Int32Ty = Type::getInt32Ty(F.getContext());
-  
-  Constant *BaseElts[] = {
-    ConstantInt::get(Int32Ty, Roots.size(), false),
-    ConstantInt::get(Int32Ty, NumMeta, false),
-  };
-
-  Constant *DescriptorElts[] = {
-    ConstantStruct::get(FrameMapTy, BaseElts),
-    ConstantArray::get(ArrayType::get(VoidPtr, NumMeta), Metadata)
-  };
-
-  Type *EltTys[] = { DescriptorElts[0]->getType(),DescriptorElts[1]->getType()};
-  StructType *STy = StructType::create(EltTys, "gc_map."+utostr(NumMeta));
-  
-  Constant *FrameMap = ConstantStruct::get(STy, DescriptorElts);
-
-  // FIXME: Is this actually dangerous as WritingAnLLVMPass.html claims? Seems
-  //        that, short of multithreaded LLVM, it should be safe; all that is
-  //        necessary is that a simple Module::iterator loop not be invalidated.
-  //        Appending to the GlobalVariable list is safe in that sense.
-  //
-  //        All of the output passes emit globals last. The ExecutionEngine
-  //        explicitly supports adding globals to the module after
-  //        initialization.
-  //
-  //        Still, if it isn't deemed acceptable, then this transformation needs
-  //        to be a ModulePass (which means it cannot be in the 'llc' pipeline
-  //        (which uses a FunctionPassManager (which segfaults (not asserts) if
-  //        provided a ModulePass))).
-  Constant *GV = new GlobalVariable(*F.getParent(), FrameMap->getType(), true,
-                                    GlobalVariable::InternalLinkage,
-                                    FrameMap, "__gc_" + F.getName());
-
-  Constant *GEPIndices[2] = {
-                          ConstantInt::get(Type::getInt32Ty(F.getContext()), 0),
-                          ConstantInt::get(Type::getInt32Ty(F.getContext()), 0)
-                          };
-  return ConstantExpr::getGetElementPtr(GV, GEPIndices);
-}
-
-Type* ShadowStackGC::GetConcreteStackEntryType(Function &F) {
-  // doInitialization creates the generic version of this type.
-  std::vector<Type*> EltTys;
-  EltTys.push_back(StackEntryTy);
-  for (size_t I = 0; I != Roots.size(); I++)
-    EltTys.push_back(Roots[I].second->getAllocatedType());
-  
-  return StructType::create(EltTys, "gc_stackentry."+F.getName().str());
-}
-
-/// doInitialization - If this module uses the GC intrinsics, find them now. If
-/// not, exit fast.
-bool ShadowStackGC::initializeCustomLowering(Module &M) {
-  // struct FrameMap {
-  //   int32_t NumRoots; // Number of roots in stack frame.
-  //   int32_t NumMeta;  // Number of metadata descriptors. May be < NumRoots.
-  //   void *Meta[];     // May be absent for roots without metadata.
-  // };
-  std::vector<Type*> EltTys;
-  // 32 bits is ok up to a 32GB stack frame. :)
-  EltTys.push_back(Type::getInt32Ty(M.getContext()));
-  // Specifies length of variable length array. 
-  EltTys.push_back(Type::getInt32Ty(M.getContext()));
-  FrameMapTy = StructType::create(EltTys, "gc_map");
-  PointerType *FrameMapPtrTy = PointerType::getUnqual(FrameMapTy);
-
-  // struct StackEntry {
-  //   ShadowStackEntry *Next; // Caller's stack entry.
-  //   FrameMap *Map;          // Pointer to constant FrameMap.
-  //   void *Roots[];          // Stack roots (in-place array, so we pretend).
-  // };
-  
-  StackEntryTy = StructType::create(M.getContext(), "gc_stackentry");
-  
-  EltTys.clear();
-  EltTys.push_back(PointerType::getUnqual(StackEntryTy));
-  EltTys.push_back(FrameMapPtrTy);
-  StackEntryTy->setBody(EltTys);
-  PointerType *StackEntryPtrTy = PointerType::getUnqual(StackEntryTy);
-
-  // Get the root chain if it already exists.
-  Head = M.getGlobalVariable("llvm_gc_root_chain");
-  if (!Head) {
-    // If the root chain does not exist, insert a new one with linkonce
-    // linkage!
-    Head = new GlobalVariable(M, StackEntryPtrTy, false,
-                              GlobalValue::LinkOnceAnyLinkage,
-                              Constant::getNullValue(StackEntryPtrTy),
-                              "llvm_gc_root_chain");
-  } else if (Head->hasExternalLinkage() && Head->isDeclaration()) {
-    Head->setInitializer(Constant::getNullValue(StackEntryPtrTy));
-    Head->setLinkage(GlobalValue::LinkOnceAnyLinkage);
-  }
-
-  return true;
-}
-
-bool ShadowStackGC::IsNullValue(Value *V) {
-  if (Constant *C = dyn_cast<Constant>(V))
-    return C->isNullValue();
-  return false;
-}
-
-void ShadowStackGC::CollectRoots(Function &F) {
-  // FIXME: Account for original alignment. Could fragment the root array.
-  //   Approach 1: Null initialize empty slots at runtime. Yuck.
-  //   Approach 2: Emit a map of the array instead of just a count.
-
-  assert(Roots.empty() && "Not cleaned up?");
-
-  SmallVector<std::pair<CallInst*, AllocaInst*>, 16> MetaRoots;
-
-  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
-    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;)
-      if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(II++))
-        if (Function *F = CI->getCalledFunction())
-          if (F->getIntrinsicID() == Intrinsic::gcroot) {
-            std::pair<CallInst*, AllocaInst*> Pair = std::make_pair(
-              CI, cast<AllocaInst>(CI->getArgOperand(0)->stripPointerCasts()));
-            if (IsNullValue(CI->getArgOperand(1)))
-              Roots.push_back(Pair);
-            else
-              MetaRoots.push_back(Pair);
-          }
-
-  // Number roots with metadata (usually empty) at the beginning, so that the
-  // FrameMap::Meta array can be elided.
-  Roots.insert(Roots.begin(), MetaRoots.begin(), MetaRoots.end());
-}
-
-GetElementPtrInst *
-ShadowStackGC::CreateGEP(LLVMContext &Context, IRBuilder<> &B, Value *BasePtr,
-                         int Idx, int Idx2, const char *Name) {
-  Value *Indices[] = { ConstantInt::get(Type::getInt32Ty(Context), 0),
-                       ConstantInt::get(Type::getInt32Ty(Context), Idx),
-                       ConstantInt::get(Type::getInt32Ty(Context), Idx2) };
-  Value* Val = B.CreateGEP(BasePtr, Indices, Name);
-
-  assert(isa<GetElementPtrInst>(Val) && "Unexpected folded constant");
-
-  return dyn_cast<GetElementPtrInst>(Val);
-}
-
-GetElementPtrInst *
-ShadowStackGC::CreateGEP(LLVMContext &Context, IRBuilder<> &B, Value *BasePtr,
-                         int Idx, const char *Name) {
-  Value *Indices[] = { ConstantInt::get(Type::getInt32Ty(Context), 0),
-                       ConstantInt::get(Type::getInt32Ty(Context), Idx) };
-  Value *Val = B.CreateGEP(BasePtr, Indices, Name);
-
-  assert(isa<GetElementPtrInst>(Val) && "Unexpected folded constant");
-
-  return dyn_cast<GetElementPtrInst>(Val);
-}
-
-/// runOnFunction - Insert code to maintain the shadow stack.
-bool ShadowStackGC::performCustomLowering(Function &F) {
-  LLVMContext &Context = F.getContext();
-  
-  // Find calls to llvm.gcroot.
-  CollectRoots(F);
-
-  // If there are no roots in this function, then there is no need to add a
-  // stack map entry for it.
-  if (Roots.empty())
-    return false;
-
-  // Build the constant map and figure the type of the shadow stack entry.
-  Value *FrameMap = GetFrameMap(F);
-  Type *ConcreteStackEntryTy = GetConcreteStackEntryType(F);
-
-  // Build the shadow stack entry at the very start of the function.
-  BasicBlock::iterator IP = F.getEntryBlock().begin();
-  IRBuilder<> AtEntry(IP->getParent(), IP);
-
-  Instruction *StackEntry = AtEntry.CreateAlloca(ConcreteStackEntryTy, nullptr,
-                                                 "gc_frame");
-
-  while (isa<AllocaInst>(IP)) ++IP;
-  AtEntry.SetInsertPoint(IP->getParent(), IP);
-
-  // Initialize the map pointer and load the current head of the shadow stack.
-  Instruction *CurrentHead  = AtEntry.CreateLoad(Head, "gc_currhead");
-  Instruction *EntryMapPtr  = CreateGEP(Context, AtEntry, StackEntry,
-                                        0,1,"gc_frame.map");
-  AtEntry.CreateStore(FrameMap, EntryMapPtr);
-
-  // After all the allocas...
-  for (unsigned I = 0, E = Roots.size(); I != E; ++I) {
-    // For each root, find the corresponding slot in the aggregate...
-    Value *SlotPtr = CreateGEP(Context, AtEntry, StackEntry, 1 + I, "gc_root");
-
-    // And use it in lieu of the alloca.
-    AllocaInst *OriginalAlloca = Roots[I].second;
-    SlotPtr->takeName(OriginalAlloca);
-    OriginalAlloca->replaceAllUsesWith(SlotPtr);
-  }
-
-  // Move past the original stores inserted by GCStrategy::InitRoots. This isn't
-  // really necessary (the collector would never see the intermediate state at
-  // runtime), but it's nicer not to push the half-initialized entry onto the
-  // shadow stack.
-  while (isa<StoreInst>(IP)) ++IP;
-  AtEntry.SetInsertPoint(IP->getParent(), IP);
-
-  // Push the entry onto the shadow stack.
-  Instruction *EntryNextPtr = CreateGEP(Context, AtEntry,
-                                        StackEntry,0,0,"gc_frame.next");
-  Instruction *NewHeadVal   = CreateGEP(Context, AtEntry, 
-                                        StackEntry, 0, "gc_newhead");
-  AtEntry.CreateStore(CurrentHead, EntryNextPtr);
-  AtEntry.CreateStore(NewHeadVal, Head);
-
-  // For each instruction that escapes...
-  EscapeEnumerator EE(F, "gc_cleanup");
-  while (IRBuilder<> *AtExit = EE.Next()) {
-    // Pop the entry from the shadow stack. Don't reuse CurrentHead from
-    // AtEntry, since that would make the value live for the entire function.
-    Instruction *EntryNextPtr2 = CreateGEP(Context, *AtExit, StackEntry, 0, 0,
-                                           "gc_frame.next");
-    Value *SavedHead = AtExit->CreateLoad(EntryNextPtr2, "gc_savedhead");
-                       AtExit->CreateStore(SavedHead, Head);
-  }
-
-  // Delete the original allocas (which are no longer used) and the intrinsic
-  // calls (which are no longer valid). Doing this last avoids invalidating
-  // iterators.
-  for (unsigned I = 0, E = Roots.size(); I != E; ++I) {
-    Roots[I].first->eraseFromParent();
-    Roots[I].second->eraseFromParent();
-  }
-
-  Roots.clear();
-  return true;
-}
diff --git a/lib/CodeGen/ShadowStackGCLowering.cpp b/lib/CodeGen/ShadowStackGCLowering.cpp
new file mode 100644
index 0000000..f6393a5
--- /dev/null
+++ b/lib/CodeGen/ShadowStackGCLowering.cpp
@@ -0,0 +1,457 @@
+//===-- ShadowStackGCLowering.cpp - Custom lowering for shadow-stack gc ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom lowering code required by the shadow-stack GC
+// strategy.  
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "shadowstackgclowering"
+
+namespace {
+
+class ShadowStackGCLowering : public FunctionPass {
+  /// RootChain - This is the global linked-list that contains the chain of GC
+  /// roots.
+  GlobalVariable *Head;
+
+  /// StackEntryTy - Abstract type of a link in the shadow stack.
+  ///
+  StructType *StackEntryTy;
+  StructType *FrameMapTy;
+
+  /// Roots - GC roots in the current function. Each is a pair of the
+  /// intrinsic call and its corresponding alloca.
+  std::vector<std::pair<CallInst *, AllocaInst *>> Roots;
+
+public:
+  static char ID;
+  ShadowStackGCLowering();
+
+  bool doInitialization(Module &M) override;
+  bool runOnFunction(Function &F) override;
+
+private:
+  bool IsNullValue(Value *V);
+  Constant *GetFrameMap(Function &F);
+  Type *GetConcreteStackEntryType(Function &F);
+  void CollectRoots(Function &F);
+  static GetElementPtrInst *CreateGEP(LLVMContext &Context, IRBuilder<> &B,
+                                      Value *BasePtr, int Idx1,
+                                      const char *Name);
+  static GetElementPtrInst *CreateGEP(LLVMContext &Context, IRBuilder<> &B,
+                                      Value *BasePtr, int Idx1, int Idx2,
+                                      const char *Name);
+};
+}
+
+INITIALIZE_PASS_BEGIN(ShadowStackGCLowering, "shadow-stack-gc-lowering",
+                      "Shadow Stack GC Lowering", false, false)
+INITIALIZE_PASS_DEPENDENCY(GCModuleInfo)
+INITIALIZE_PASS_END(ShadowStackGCLowering, "shadow-stack-gc-lowering",
+                    "Shadow Stack GC Lowering", false, false)
+
+FunctionPass *llvm::createShadowStackGCLoweringPass() {
+  return new ShadowStackGCLowering();
+}
+
+char ShadowStackGCLowering::ID = 0;
+
+ShadowStackGCLowering::ShadowStackGCLowering()
+  : FunctionPass(ID), Head(nullptr), StackEntryTy(nullptr),
+    FrameMapTy(nullptr) {
+  initializeShadowStackGCLoweringPass(*PassRegistry::getPassRegistry());
+}
+
+namespace {
+/// EscapeEnumerator - This is a little algorithm to find all escape points
+/// from a function so that "finally"-style code can be inserted. In addition
+/// to finding the existing return and unwind instructions, it also (if
+/// necessary) transforms any call instructions into invokes and sends them to
+/// a landing pad.
+///
+/// It's wrapped up in a state machine using the same transform C# uses for
+/// 'yield return' enumerators. This transform allows it to be non-allocating.
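+///
+/// Roughly (as implemented below): state 0 seeds the basic block iterators,
+/// state 1 yields an insertion point before every return/resume instruction,
+/// and once those are exhausted the function's non-intrinsic calls are
+/// rewritten into invokes that unwind to a newly created cleanup landing pad,
+/// whose resume point is yielded last; any further call returns null.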
+class EscapeEnumerator {
+  Function &F;
+  const char *CleanupBBName;
+
+  // State.
+  int State;
+  Function::iterator StateBB, StateE;
+  IRBuilder<> Builder;
+
+public:
+  EscapeEnumerator(Function &F, const char *N = "cleanup")
+      : F(F), CleanupBBName(N), State(0), Builder(F.getContext()) {}
+
+  IRBuilder<> *Next() {
+    switch (State) {
+    default:
+      return nullptr;
+
+    case 0:
+      StateBB = F.begin();
+      StateE = F.end();
+      State = 1;
+
+    case 1:
+      // Find all 'return', 'resume', and 'unwind' instructions.
+      while (StateBB != StateE) {
+        BasicBlock *CurBB = StateBB++;
+
+        // Branches and invokes do not escape, only unwind, resume, and return
+        // do.
+        TerminatorInst *TI = CurBB->getTerminator();
+        if (!isa<ReturnInst>(TI) && !isa<ResumeInst>(TI))
+          continue;
+
+        Builder.SetInsertPoint(TI->getParent(), TI);
+        return &Builder;
+      }
+
+      State = 2;
+
+      // Find all 'call' instructions.
+      SmallVector<Instruction *, 16> Calls;
+      for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+        for (BasicBlock::iterator II = BB->begin(), EE = BB->end(); II != EE;
+             ++II)
+          if (CallInst *CI = dyn_cast<CallInst>(II))
+            if (!CI->getCalledFunction() ||
+                !CI->getCalledFunction()->getIntrinsicID())
+              Calls.push_back(CI);
+
+      if (Calls.empty())
+        return nullptr;
+
+      // Create a cleanup block.
+      LLVMContext &C = F.getContext();
+      BasicBlock *CleanupBB = BasicBlock::Create(C, CleanupBBName, &F);
+      Type *ExnTy =
+          StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C), nullptr);
+      Constant *PersFn = F.getParent()->getOrInsertFunction(
+          "__gcc_personality_v0", FunctionType::get(Type::getInt32Ty(C), true));
+      LandingPadInst *LPad =
+          LandingPadInst::Create(ExnTy, PersFn, 1, "cleanup.lpad", CleanupBB);
+      LPad->setCleanup(true);
+      ResumeInst *RI = ResumeInst::Create(LPad, CleanupBB);
+
+      // Transform the 'call' instructions into 'invoke's branching to the
+      // cleanup block. Go in reverse order to make prettier BB names.
+      SmallVector<Value *, 16> Args;
+      for (unsigned I = Calls.size(); I != 0;) {
+        CallInst *CI = cast<CallInst>(Calls[--I]);
+
+        // Split the basic block containing the function call.
+        BasicBlock *CallBB = CI->getParent();
+        BasicBlock *NewBB =
+            CallBB->splitBasicBlock(CI, CallBB->getName() + ".cont");
+
+        // Remove the unconditional branch inserted at the end of CallBB.
+        CallBB->getInstList().pop_back();
+        NewBB->getInstList().remove(CI);
+
+        // Create a new invoke instruction.
+        Args.clear();
+        CallSite CS(CI);
+        Args.append(CS.arg_begin(), CS.arg_end());
+
+        InvokeInst *II =
+            InvokeInst::Create(CI->getCalledValue(), NewBB, CleanupBB, Args,
+                               CI->getName(), CallBB);
+        II->setCallingConv(CI->getCallingConv());
+        II->setAttributes(CI->getAttributes());
+        CI->replaceAllUsesWith(II);
+        delete CI;
+      }
+
+      Builder.SetInsertPoint(RI->getParent(), RI);
+      return &Builder;
+    }
+  }
+};
+}
+
+Constant *ShadowStackGCLowering::GetFrameMap(Function &F) {
+  // doInitialization creates the abstract type of this value.
+  Type *VoidPtr = Type::getInt8PtrTy(F.getContext());
+
+  // Truncate the ShadowStackDescriptor if some metadata is null.
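+  // (NumMeta ends up one past the last root that carries non-null metadata,
+  // so trailing null entries are dropped by the resize below.)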
+  unsigned NumMeta = 0;
+  SmallVector<Constant *, 16> Metadata;
+  for (unsigned I = 0; I != Roots.size(); ++I) {
+    Constant *C = cast<Constant>(Roots[I].first->getArgOperand(1));
+    if (!C->isNullValue())
+      NumMeta = I + 1;
+    Metadata.push_back(ConstantExpr::getBitCast(C, VoidPtr));
+  }
+  Metadata.resize(NumMeta);
+
+  Type *Int32Ty = Type::getInt32Ty(F.getContext());
+
+  Constant *BaseElts[] = {
+      ConstantInt::get(Int32Ty, Roots.size(), false),
+      ConstantInt::get(Int32Ty, NumMeta, false),
+  };
+
+  Constant *DescriptorElts[] = {
+      ConstantStruct::get(FrameMapTy, BaseElts),
+      ConstantArray::get(ArrayType::get(VoidPtr, NumMeta), Metadata)};
+
+  Type *EltTys[] = {DescriptorElts[0]->getType(), DescriptorElts[1]->getType()};
+  StructType *STy = StructType::create(EltTys, "gc_map." + utostr(NumMeta));
+
+  Constant *FrameMap = ConstantStruct::get(STy, DescriptorElts);
+
+  // FIXME: Is this actually dangerous as WritingAnLLVMPass.html claims? Seems
+  //        that, short of multithreaded LLVM, it should be safe; all that is
+  //        necessary is that a simple Module::iterator loop not be invalidated.
+  //        Appending to the GlobalVariable list is safe in that sense.
+  //
+  //        All of the output passes emit globals last. The ExecutionEngine
+  //        explicitly supports adding globals to the module after
+  //        initialization.
+  //
+  //        Still, if it isn't deemed acceptable, then this transformation needs
+  //        to be a ModulePass (which means it cannot be in the 'llc' pipeline
+  //        (which uses a FunctionPassManager (which segfaults (not asserts) if
+  //        provided a ModulePass))).
+  Constant *GV = new GlobalVariable(*F.getParent(), FrameMap->getType(), true,
+                                    GlobalVariable::InternalLinkage, FrameMap,
+                                    "__gc_" + F.getName());
+
+  Constant *GEPIndices[2] = {
+      ConstantInt::get(Type::getInt32Ty(F.getContext()), 0),
+      ConstantInt::get(Type::getInt32Ty(F.getContext()), 0)};
+  return ConstantExpr::getGetElementPtr(GV, GEPIndices);
+}
+
+Type *ShadowStackGCLowering::GetConcreteStackEntryType(Function &F) {
+  // doInitialization creates the generic version of this type.
+  std::vector<Type *> EltTys;
+  EltTys.push_back(StackEntryTy);
+  for (size_t I = 0; I != Roots.size(); I++)
+    EltTys.push_back(Roots[I].second->getAllocatedType());
+
+  return StructType::create(EltTys, "gc_stackentry." + F.getName().str());
+}
+
+/// doInitialization - If this module uses the GC intrinsics, find them now. If
+/// not, exit fast.
+bool ShadowStackGCLowering::doInitialization(Module &M) {
+  bool Active = false;
+  for (Function &F : M) {
+    if (F.hasGC() && F.getGC() == std::string("shadow-stack")) {
+      Active = true;
+      break;
+    }
+  }
+  if (!Active)
+    return false;
+  
+  // struct FrameMap {
+  //   int32_t NumRoots; // Number of roots in stack frame.
+  //   int32_t NumMeta;  // Number of metadata descriptors. May be < NumRoots.
+  //   void *Meta[];     // May be absent for roots without metadata.
+  // };
+  std::vector<Type *> EltTys;
+  // 32 bits is ok up to a 32GB stack frame. :)
+  EltTys.push_back(Type::getInt32Ty(M.getContext()));
+  // Specifies length of variable length array.
+  EltTys.push_back(Type::getInt32Ty(M.getContext()));
+  FrameMapTy = StructType::create(EltTys, "gc_map");
+  PointerType *FrameMapPtrTy = PointerType::getUnqual(FrameMapTy);
+
+  // struct StackEntry {
+  //   ShadowStackEntry *Next; // Caller's stack entry.
+  //   FrameMap *Map;          // Pointer to constant FrameMap.
+  //   void *Roots[];          // Stack roots (in-place array, so we pretend).
+  // };
+
+  StackEntryTy = StructType::create(M.getContext(), "gc_stackentry");
+
+  EltTys.clear();
+  EltTys.push_back(PointerType::getUnqual(StackEntryTy));
+  EltTys.push_back(FrameMapPtrTy);
+  StackEntryTy->setBody(EltTys);
+  PointerType *StackEntryPtrTy = PointerType::getUnqual(StackEntryTy);
+
+  // Get the root chain if it already exists.
+  Head = M.getGlobalVariable("llvm_gc_root_chain");
+  if (!Head) {
+    // If the root chain does not exist, insert a new one with linkonce
+    // linkage!
+    Head = new GlobalVariable(
+        M, StackEntryPtrTy, false, GlobalValue::LinkOnceAnyLinkage,
+        Constant::getNullValue(StackEntryPtrTy), "llvm_gc_root_chain");
+  } else if (Head->hasExternalLinkage() && Head->isDeclaration()) {
+    Head->setInitializer(Constant::getNullValue(StackEntryPtrTy));
+    Head->setLinkage(GlobalValue::LinkOnceAnyLinkage);
+  }
+
+  return true;
+}
+
+bool ShadowStackGCLowering::IsNullValue(Value *V) {
+  if (Constant *C = dyn_cast<Constant>(V))
+    return C->isNullValue();
+  return false;
+}
+
+void ShadowStackGCLowering::CollectRoots(Function &F) {
+  // FIXME: Account for original alignment. Could fragment the root array.
+  //   Approach 1: Null initialize empty slots at runtime. Yuck.
+  //   Approach 2: Emit a map of the array instead of just a count.
+
+  assert(Roots.empty() && "Not cleaned up?");
+
+  SmallVector<std::pair<CallInst *, AllocaInst *>, 16> MetaRoots;
+
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;)
+      if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(II++))
+        if (Function *F = CI->getCalledFunction())
+          if (F->getIntrinsicID() == Intrinsic::gcroot) {
+            std::pair<CallInst *, AllocaInst *> Pair = std::make_pair(
+                CI,
+                cast<AllocaInst>(CI->getArgOperand(0)->stripPointerCasts()));
+            if (IsNullValue(CI->getArgOperand(1)))
+              Roots.push_back(Pair);
+            else
+              MetaRoots.push_back(Pair);
+          }
+
+  // Number roots with metadata (usually empty) at the beginning, so that the
+  // FrameMap::Meta array can be elided.
+  Roots.insert(Roots.begin(), MetaRoots.begin(), MetaRoots.end());
+}
+
+GetElementPtrInst *ShadowStackGCLowering::CreateGEP(LLVMContext &Context,
+                                            IRBuilder<> &B, Value *BasePtr,
+                                            int Idx, int Idx2,
+                                            const char *Name) {
+  Value *Indices[] = {ConstantInt::get(Type::getInt32Ty(Context), 0),
+                      ConstantInt::get(Type::getInt32Ty(Context), Idx),
+                      ConstantInt::get(Type::getInt32Ty(Context), Idx2)};
+  Value *Val = B.CreateGEP(BasePtr, Indices, Name);
+
+  assert(isa<GetElementPtrInst>(Val) && "Unexpected folded constant");
+
+  return dyn_cast<GetElementPtrInst>(Val);
+}
+
+GetElementPtrInst *ShadowStackGCLowering::CreateGEP(LLVMContext &Context,
+                                            IRBuilder<> &B, Value *BasePtr,
+                                            int Idx, const char *Name) {
+  Value *Indices[] = {ConstantInt::get(Type::getInt32Ty(Context), 0),
+                      ConstantInt::get(Type::getInt32Ty(Context), Idx)};
+  Value *Val = B.CreateGEP(BasePtr, Indices, Name);
+
+  assert(isa<GetElementPtrInst>(Val) && "Unexpected folded constant");
+
+  return dyn_cast<GetElementPtrInst>(Val);
+}
+
+/// runOnFunction - Insert code to maintain the shadow stack.
+bool ShadowStackGCLowering::runOnFunction(Function &F) {
+  // Quick exit for functions that do not use the shadow stack GC.
+  if (!F.hasGC() ||
+      F.getGC() != std::string("shadow-stack"))
+    return false;
+  
+  LLVMContext &Context = F.getContext();
+
+  // Find calls to llvm.gcroot.
+  CollectRoots(F);
+
+  // If there are no roots in this function, then there is no need to add a
+  // stack map entry for it.
+  if (Roots.empty())
+    return false;
+
+  // Build the constant map and figure the type of the shadow stack entry.
+  Value *FrameMap = GetFrameMap(F);
+  Type *ConcreteStackEntryTy = GetConcreteStackEntryType(F);
+
+  // Build the shadow stack entry at the very start of the function.
+  BasicBlock::iterator IP = F.getEntryBlock().begin();
+  IRBuilder<> AtEntry(IP->getParent(), IP);
+
+  Instruction *StackEntry =
+      AtEntry.CreateAlloca(ConcreteStackEntryTy, nullptr, "gc_frame");
+
+  while (isa<AllocaInst>(IP))
+    ++IP;
+  AtEntry.SetInsertPoint(IP->getParent(), IP);
+
+  // Initialize the map pointer and load the current head of the shadow stack.
+  Instruction *CurrentHead = AtEntry.CreateLoad(Head, "gc_currhead");
+  Instruction *EntryMapPtr =
+      CreateGEP(Context, AtEntry, StackEntry, 0, 1, "gc_frame.map");
+  AtEntry.CreateStore(FrameMap, EntryMapPtr);
+
+  // After all the allocas...
+  for (unsigned I = 0, E = Roots.size(); I != E; ++I) {
+    // For each root, find the corresponding slot in the aggregate...
+    Value *SlotPtr = CreateGEP(Context, AtEntry, StackEntry, 1 + I, "gc_root");
+
+    // And use it in lieu of the alloca.
+    AllocaInst *OriginalAlloca = Roots[I].second;
+    SlotPtr->takeName(OriginalAlloca);
+    OriginalAlloca->replaceAllUsesWith(SlotPtr);
+  }
+
+  // Move past the original stores inserted by GCStrategy::InitRoots. This isn't
+  // really necessary (the collector would never see the intermediate state at
+  // runtime), but it's nicer not to push the half-initialized entry onto the
+  // shadow stack.
+  while (isa<StoreInst>(IP))
+    ++IP;
+  AtEntry.SetInsertPoint(IP->getParent(), IP);
+
+  // Push the entry onto the shadow stack.
+  Instruction *EntryNextPtr =
+      CreateGEP(Context, AtEntry, StackEntry, 0, 0, "gc_frame.next");
+  Instruction *NewHeadVal =
+      CreateGEP(Context, AtEntry, StackEntry, 0, "gc_newhead");
+  AtEntry.CreateStore(CurrentHead, EntryNextPtr);
+  AtEntry.CreateStore(NewHeadVal, Head);
+
+  // For each instruction that escapes...
+  EscapeEnumerator EE(F, "gc_cleanup");
+  while (IRBuilder<> *AtExit = EE.Next()) {
+    // Pop the entry from the shadow stack. Don't reuse CurrentHead from
+    // AtEntry, since that would make the value live for the entire function.
+    Instruction *EntryNextPtr2 =
+        CreateGEP(Context, *AtExit, StackEntry, 0, 0, "gc_frame.next");
+    Value *SavedHead = AtExit->CreateLoad(EntryNextPtr2, "gc_savedhead");
+    AtExit->CreateStore(SavedHead, Head);
+  }
+
+  // Delete the original allocas (which are no longer used) and the intrinsic
+  // calls (which are no longer valid). Doing this last avoids invalidating
+  // iterators.
+  for (unsigned I = 0, E = Roots.size(); I != E; ++I) {
+    Roots[I].first->eraseFromParent();
+    Roots[I].second->eraseFromParent();
+  }
+
+  Roots.clear();
+  return true;
+}
diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp
index 7fd8107..35e4292 100644
--- a/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/lib/CodeGen/SjLjEHPrepare.cpp
@@ -191,7 +191,7 @@
   // Create an alloca for the incoming jump buffer ptr and the new jump buffer
   // that needs to be restored on all exits from the function. This is an alloca
   // because the value needs to be added to the global context list.
-  const TargetLowering *TLI = TM->getSubtargetImpl()->getTargetLowering();
+  const TargetLowering *TLI = TM->getSubtargetImpl(F)->getTargetLowering();
   unsigned Align =
       TLI->getDataLayout()->getPrefTypeAlignment(FunctionContextTy);
   FuncCtx = new AllocaInst(FunctionContextTy, nullptr, Align, "fn_context",
diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp
index ea7b914..dab1dfe 100644
--- a/lib/CodeGen/SplitKit.cpp
+++ b/lib/CodeGen/SplitKit.cpp
@@ -120,10 +120,9 @@
 
   // First get all the defs from the interval values. This provides the correct
   // slots for early clobbers.
-  for (LiveInterval::const_vni_iterator I = CurLI->vni_begin(),
-       E = CurLI->vni_end(); I != E; ++I)
-    if (!(*I)->isPHIDef() && !(*I)->isUnused())
-      UseSlots.push_back((*I)->def);
+  for (const VNInfo *VNI : CurLI->valnos)
+    if (!VNI->isPHIDef() && !VNI->isUnused())
+      UseSlots.push_back(VNI->def);
 
   // Get use slots from the use-def chain.
   const MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -624,8 +623,7 @@
   AssignI.setMap(RegAssign);
 
   for (unsigned i = 0, e = Copies.size(); i != e; ++i) {
-    VNInfo *VNI = Copies[i];
-    SlotIndex Def = VNI->def;
+    SlotIndex Def = Copies[i]->def;
     MachineInstr *MI = LIS.getInstructionFromIndex(Def);
     assert(MI && "No instruction for back-copy");
 
@@ -636,13 +634,12 @@
     while (!AtBegin && (--MBBI)->isDebugValue());
 
     DEBUG(dbgs() << "Removing " << Def << '\t' << *MI);
-    LI->removeValNo(VNI);
+    LIS.removeVRegDefAt(*LI, Def);
     LIS.RemoveMachineInstrFromMaps(MI);
     MI->eraseFromParent();
 
-    // Adjust RegAssign if a register assignment is killed at VNI->def.  We
-    // want to avoid calculating the live range of the source register if
-    // possible.
+    // Adjust RegAssign if a register assignment is killed at Def. We want to
+    // avoid calculating the live range of the source register if possible.
     AssignI.find(Def.getPrevSlot());
     if (!AssignI.valid() || AssignI.start() >= Def)
       continue;
@@ -727,9 +724,7 @@
 
   // Find the nearest common dominator for parent values with multiple
   // back-copies.  If a single back-copy dominates, put it in DomPair.second.
-  for (LiveInterval::vni_iterator VI = LI->vni_begin(), VE = LI->vni_end();
-       VI != VE; ++VI) {
-    VNInfo *VNI = *VI;
+  for (VNInfo *VNI : LI->valnos) {
     if (VNI->isUnused())
       continue;
     VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
@@ -802,9 +797,7 @@
   // Remove redundant back-copies that are now known to be dominated by another
   // def with the same value.
   SmallVector<VNInfo*, 8> BackCopies;
-  for (LiveInterval::vni_iterator VI = LI->vni_begin(), VE = LI->vni_end();
-       VI != VE; ++VI) {
-    VNInfo *VNI = *VI;
+  for (VNInfo *VNI : LI->valnos) {
     if (VNI->isUnused())
       continue;
     VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
@@ -823,16 +816,15 @@
 bool SplitEditor::transferValues() {
   bool Skipped = false;
   RegAssignMap::const_iterator AssignI = RegAssign.begin();
-  for (LiveInterval::const_iterator ParentI = Edit->getParent().begin(),
-         ParentE = Edit->getParent().end(); ParentI != ParentE; ++ParentI) {
-    DEBUG(dbgs() << "  blit " << *ParentI << ':');
-    VNInfo *ParentVNI = ParentI->valno;
+  for (const LiveRange::Segment &S : Edit->getParent()) {
+    DEBUG(dbgs() << "  blit " << S << ':');
+    VNInfo *ParentVNI = S.valno;
     // RegAssign has holes where RegIdx 0 should be used.
-    SlotIndex Start = ParentI->start;
+    SlotIndex Start = S.start;
     AssignI.advanceTo(Start);
     do {
       unsigned RegIdx;
-      SlotIndex End = ParentI->end;
+      SlotIndex End = S.end;
       if (!AssignI.valid()) {
         RegIdx = 0;
       } else if (AssignI.start() <= Start) {
@@ -917,7 +909,7 @@
         ++MBB;
       }
       Start = End;
-    } while (Start != ParentI->end);
+    } while (Start != S.end);
     DEBUG(dbgs() << '\n');
   }
 
@@ -930,9 +922,7 @@
 
 void SplitEditor::extendPHIKillRanges() {
     // Extend live ranges to be live-out for successor PHI values.
-  for (LiveInterval::const_vni_iterator I = Edit->getParent().vni_begin(),
-       E = Edit->getParent().vni_end(); I != E; ++I) {
-    const VNInfo *PHIVNI = *I;
+  for (const VNInfo *PHIVNI : Edit->getParent().valnos) {
     if (PHIVNI->isUnused() || !PHIVNI->isPHIDef())
       continue;
     unsigned RegIdx = RegAssign.lookup(PHIVNI->def);
@@ -1006,12 +996,11 @@
   SmallVector<MachineInstr*, 8> Dead;
   for (LiveRangeEdit::iterator I = Edit->begin(), E = Edit->end(); I != E; ++I){
     LiveInterval *LI = &LIS.getInterval(*I);
-    for (LiveInterval::const_iterator LII = LI->begin(), LIE = LI->end();
-           LII != LIE; ++LII) {
+    for (const LiveRange::Segment &S : LI->segments) {
       // Dead defs end at the dead slot.
-      if (LII->end != LII->valno->def.getDeadSlot())
+      if (S.end != S.valno->def.getDeadSlot())
         continue;
-      MachineInstr *MI = LIS.getInstructionFromIndex(LII->valno->def);
+      MachineInstr *MI = LIS.getInstructionFromIndex(S.valno->def);
       assert(MI && "Missing instruction for dead def");
       MI->addRegisterDead(LI->reg, &TRI);
 
@@ -1036,9 +1025,7 @@
   // the inserted copies.
 
   // Add the original defs from the parent interval.
-  for (LiveInterval::const_vni_iterator I = Edit->getParent().vni_begin(),
-         E = Edit->getParent().vni_end(); I != E; ++I) {
-    const VNInfo *ParentVNI = *I;
+  for (const VNInfo *ParentVNI : Edit->getParent().valnos) {
     if (ParentVNI->isUnused())
       continue;
     unsigned RegIdx = RegAssign.lookup(ParentVNI->def);
diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp
index dcf1b44..faf94b6 100644
--- a/lib/CodeGen/StackColoring.cpp
+++ b/lib/CodeGen/StackColoring.cpp
@@ -463,7 +463,8 @@
     if (!VI.Var)
       continue;
     if (SlotRemap.count(VI.Slot)) {
-      DEBUG(dbgs()<<"Remapping debug info for ["<<VI.Var->getName()<<"].\n");
+      DEBUG(dbgs() << "Remapping debug info for ["
+                   << DIVariable(VI.Var).getName() << "].\n");
       VI.Slot = SlotRemap[VI.Slot];
       FixedDbg++;
     }
diff --git a/lib/CodeGen/StackMapLivenessAnalysis.cpp b/lib/CodeGen/StackMapLivenessAnalysis.cpp
index c2ee87a..767f43a 100644
--- a/lib/CodeGen/StackMapLivenessAnalysis.cpp
+++ b/lib/CodeGen/StackMapLivenessAnalysis.cpp
@@ -123,5 +123,7 @@
   for (LivePhysRegs::const_iterator RI = LiveRegs.begin(), RE = LiveRegs.end();
        RI != RE; ++RI)
     Mask[*RI / 32] |= 1U << (*RI % 32);
+
+  TRI->adjustStackMapLiveOutMask(Mask);
   return Mask;
 }
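
A hypothetical target-side sketch of the hook being called above (names such as MyTargetRegisterInfo and MyTarget::SP are assumptions, not code from this change): a target can override adjustStackMapLiveOutMask to drop registers the runtime never needs reported.

// Hypothetical override: clear the stack pointer from the live-out mask.
void MyTargetRegisterInfo::adjustStackMapLiveOutMask(uint32_t *Mask) const {
  Mask[MyTarget::SP / 32] &= ~(1U << (MyTarget::SP % 32));
}
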
diff --git a/lib/CodeGen/StackMaps.cpp b/lib/CodeGen/StackMaps.cpp
index d3791c3..5d46419 100644
--- a/lib/CodeGen/StackMaps.cpp
+++ b/lib/CodeGen/StackMaps.cpp
@@ -84,8 +84,7 @@
     switch (MOI->getImm()) {
     default: llvm_unreachable("Unrecognized operand type.");
     case StackMaps::DirectMemRefOp: {
-      unsigned Size =
-          AP.TM.getSubtargetImpl()->getDataLayout()->getPointerSizeInBits();
+      unsigned Size = AP.TM.getDataLayout()->getPointerSizeInBits();
       assert((Size % 8) == 0 && "Need pointer size in bytes.");
       Size /= 8;
       unsigned Reg = (++MOI)->getReg();
@@ -241,7 +240,7 @@
   // entry.
   const MCExpr *CSOffsetExpr = MCBinaryExpr::CreateSub(
     MCSymbolRefExpr::Create(MILabel, OutContext),
-    MCSymbolRefExpr::Create(AP.CurrentFnSym, OutContext),
+    MCSymbolRefExpr::Create(AP.CurrentFnSymForSize, OutContext),
     OutContext);
 
   CSInfos.emplace_back(CSOffsetExpr, ID, std::move(Locations),
@@ -286,6 +285,18 @@
   }
 #endif
 }
+void StackMaps::recordStatepoint(const MachineInstr &MI) {
+  assert(MI.getOpcode() == TargetOpcode::STATEPOINT &&
+         "expected statepoint");
+
+  StatepointOpers opers(&MI);
+  // Record all the deopt and gc operands (they're contiguous and run from the
+  // initial index to the end of the operand list)
+  const unsigned StartIdx = opers.getVarIdx();
+  recordStackMapOpers(MI, 0xABCDEF00,
+                      MI.operands_begin() + StartIdx, MI.operands_end(),
+                      false);
+}
 
 /// Emit the stackmap header.
 ///
diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp
index 45f97ac..d1fae95 100644
--- a/lib/CodeGen/StackProtector.cpp
+++ b/lib/CodeGen/StackProtector.cpp
@@ -17,6 +17,7 @@
 #include "llvm/CodeGen/StackProtector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/Passes.h"
@@ -31,6 +32,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
@@ -86,10 +88,9 @@
   DominatorTreeWrapperPass *DTWP =
       getAnalysisIfAvailable<DominatorTreeWrapperPass>();
   DT = DTWP ? &DTWP->getDomTree() : nullptr;
-  TLI = TM->getSubtargetImpl()->getTargetLowering();
+  TLI = TM->getSubtargetImpl(Fn)->getTargetLowering();
 
-  Attribute Attr = Fn.getAttributes().getAttribute(
-      AttributeSet::FunctionIndex, "stack-protector-buffer-size");
+  Attribute Attr = Fn.getFnAttribute("stack-protector-buffer-size");
   if (Attr.isStringAttribute() &&
       Attr.getValueAsString().getAsInteger(10, SSPBufferSize))
       return false; // Invalid integer string
@@ -199,31 +200,24 @@
 bool StackProtector::RequiresStackProtector() {
   bool Strong = false;
   bool NeedsProtector = false;
-  if (F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                      Attribute::StackProtectReq)) {
+  if (F->hasFnAttribute(Attribute::StackProtectReq)) {
     NeedsProtector = true;
     Strong = true; // Use the same heuristic as strong to determine SSPLayout
-  } else if (F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                             Attribute::StackProtectStrong))
+  } else if (F->hasFnAttribute(Attribute::StackProtectStrong))
     Strong = true;
-  else if (!F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                            Attribute::StackProtect))
+  else if (!F->hasFnAttribute(Attribute::StackProtect))
     return false;
 
-  for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) {
-    BasicBlock *BB = I;
-
-    for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;
-         ++II) {
-      if (AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
+  for (const BasicBlock &BB : *F) {
+    for (const Instruction &I : BB) {
+      if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
         if (AI->isArrayAllocation()) {
           // SSP-Strong: Enable protectors for any call to alloca, regardless
           // of size.
           if (Strong)
             return true;
 
-          if (const ConstantInt *CI =
-                  dyn_cast<ConstantInt>(AI->getArraySize())) {
+          if (const auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) {
             if (CI->getLimitedValue(SSPBufferSize) >= SSPBufferSize) {
               // A call to alloca with size >= SSPBufferSize requires
               // stack protectors.
@@ -335,7 +329,7 @@
 /// Returns true if the platform/triple supports the stackprotectorcreate pseudo
 /// node.
 static bool CreatePrologue(Function *F, Module *M, ReturnInst *RI,
-                           const TargetLoweringBase *TLI, const Triple &Trip,
+                           const TargetLoweringBase *TLI, const Triple &TT,
                            AllocaInst *&AI, Value *&StackGuardVar) {
   bool SupportsSelectionDAGSP = false;
   PointerType *PtrTy = Type::getInt8PtrTy(RI->getContext());
@@ -344,9 +338,10 @@
     Constant *OffsetVal =
         ConstantInt::get(Type::getInt32Ty(RI->getContext()), Offset);
 
-    StackGuardVar = ConstantExpr::getIntToPtr(
-        OffsetVal, PointerType::get(PtrTy, AddressSpace));
-  } else if (Trip.getOS() == llvm::Triple::OpenBSD) {
+    StackGuardVar =
+        ConstantExpr::getIntToPtr(OffsetVal, PointerType::get(PtrTy,
+                                                              AddressSpace));
+  } else if (TT.isOSOpenBSD()) {
     StackGuardVar = M->getOrInsertGlobal("__guard_local", PtrTy);
     cast<GlobalValue>(StackGuardVar)
         ->setVisibility(GlobalValue::HiddenVisibility);
@@ -399,14 +394,13 @@
         InsertionPt = RI;
         // At this point we know that BB has a return statement so it *DOES*
         // have a terminator.
-        assert(InsertionPt != nullptr && "BB must have a terminator instruction at "
-                                   "this point.");
+        assert(InsertionPt != nullptr &&
+               "BB must have a terminator instruction at this point.");
       }
 
       Function *Intrinsic =
           Intrinsic::getDeclaration(M, Intrinsic::stackprotectorcheck);
       CallInst::Create(Intrinsic, StackGuardVar, "", InsertionPt);
-
     } else {
       // If we do not support SelectionDAG based tail calls, generate IR level
       // tail calls.
@@ -459,11 +453,17 @@
       LoadInst *LI1 = B.CreateLoad(StackGuardVar);
       LoadInst *LI2 = B.CreateLoad(AI);
       Value *Cmp = B.CreateICmpEQ(LI1, LI2);
-      B.CreateCondBr(Cmp, NewBB, FailBB);
+      unsigned SuccessWeight =
+          BranchProbabilityInfo::getBranchWeightStackProtector(true);
+      unsigned FailureWeight =
+          BranchProbabilityInfo::getBranchWeightStackProtector(false);
+      MDNode *Weights = MDBuilder(F->getContext())
+                            .createBranchWeights(SuccessWeight, FailureWeight);
+      B.CreateCondBr(Cmp, NewBB, FailBB, Weights);
     }
   }
 
-  // Return if we didn't modify any basic blocks. I.e., there are no return
+  // Return if we didn't modify any basic blocks. i.e., there are no return
   // statements in the function.
   if (!HasPrologue)
     return false;
@@ -477,15 +477,17 @@
   LLVMContext &Context = F->getContext();
   BasicBlock *FailBB = BasicBlock::Create(Context, "CallStackCheckFailBlk", F);
   IRBuilder<> B(FailBB);
-  if (Trip.getOS() == llvm::Triple::OpenBSD) {
-    Constant *StackChkFail = M->getOrInsertFunction(
-        "__stack_smash_handler", Type::getVoidTy(Context),
-        Type::getInt8PtrTy(Context), nullptr);
+  if (Trip.isOSOpenBSD()) {
+    Constant *StackChkFail =
+        M->getOrInsertFunction("__stack_smash_handler",
+                               Type::getVoidTy(Context),
+                               Type::getInt8PtrTy(Context), nullptr);
 
     B.CreateCall(StackChkFail, B.CreateGlobalStringPtr(F->getName(), "SSH"));
   } else {
-    Constant *StackChkFail = M->getOrInsertFunction(
-        "__stack_chk_fail", Type::getVoidTy(Context), nullptr);
+    Constant *StackChkFail =
+        M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context),
+                               nullptr);
     B.CreateCall(StackChkFail);
   }
   B.CreateUnreachable();
diff --git a/lib/CodeGen/StatepointExampleGC.cpp b/lib/CodeGen/StatepointExampleGC.cpp
new file mode 100644
index 0000000..95dfd75
--- /dev/null
+++ b/lib/CodeGen/StatepointExampleGC.cpp
@@ -0,0 +1,55 @@
+//===-- StatepointExampleGC.cpp - An example statepoint GC strategy --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a GCStrategy which serves as an example of how to use a
+// statepoint-based lowering strategy.  This GCStrategy is intended to be
+// suitable as a default implementation usable with any collector which can
+// consume the standard stackmap format generated by statepoints, uses the
+// default addrspace to distinguish between GC-managed and non-GC-managed
+// pointers, and has reasonable relocation semantics.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Value.h"
+
+using namespace llvm;
+
+namespace {
+class StatepointGC : public GCStrategy {
+public:
+  StatepointGC() {
+    UseStatepoints = true;
+    // These options are all gc.root specific, we specify them so that the
+    // gc.root lowering code doesn't run.
+    InitRoots = false;
+    NeededSafePoints = 0;
+    UsesMetadata = false;
+    CustomRoots = false;
+  }
+  Optional<bool> isGCManagedPointer(const Value *V) const override {
+    // Method is only valid on pointer typed values.
+    PointerType *PT = cast<PointerType>(V->getType());
+    // For the sake of this example GC, we arbitrarily pick addrspace(1) as our
+    // GC managed heap.  We know that a pointer into this heap needs to be
+    // updated and that no other pointer does.  Note that addrspace(1) is used
+    // only as an example, it has no special meaning, and is not reserved for
+    // GC usage.
+    return (1 == PT->getAddressSpace());
+  }
+};
+}
+
+static GCRegistry::Add<StatepointGC> X("statepoint-example",
+                                       "an example strategy for statepoint");
+
+namespace llvm {
+void linkStatepointExampleGC() {}
+}
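
A hedged usage sketch (not part of this change): a frontend opts a function into the strategy registered above by naming it on the Function, and types the pointers it wants relocated in addrspace(1). F and Ctx below are assumed to already exist.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"

// Select the example strategy by its registered name.
F->setGC("statepoint-example");

// Pointers the collector should relocate live in addrspace(1); ordinary
// addrspace(0) pointers are ignored by isGCManagedPointer().
llvm::Type *ManagedPtrTy =
    llvm::PointerType::get(llvm::Type::getInt8Ty(Ctx), /*AddressSpace=*/1);
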
diff --git a/lib/CodeGen/TailDuplication.cpp b/lib/CodeGen/TailDuplication.cpp
index 4377236..04b3992 100644
--- a/lib/CodeGen/TailDuplication.cpp
+++ b/lib/CodeGen/TailDuplication.cpp
@@ -560,8 +560,7 @@
   // compensate for the duplication.
   unsigned MaxDuplicateCount;
   if (TailDuplicateSize.getNumOccurrences() == 0 &&
-      MF.getFunction()->getAttributes().
-        hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize))
+      MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize))
     MaxDuplicateCount = 1;
   else
     MaxDuplicateCount = TailDuplicateSize;
diff --git a/lib/CodeGen/TargetFrameLoweringImpl.cpp b/lib/CodeGen/TargetFrameLoweringImpl.cpp
index 1557d10..e3f0191 100644
--- a/lib/CodeGen/TargetFrameLoweringImpl.cpp
+++ b/lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -42,3 +42,8 @@
   FrameReg = RI->getFrameRegister(MF);
   return getFrameIndexOffset(MF, FI);
 }
+
+bool TargetFrameLowering::needsFrameIndexResolution(
+    const MachineFunction &MF) const {
+  return MF.getFrameInfo()->hasStackObjects();
+}
diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp
index ab45f89..2566c1f 100644
--- a/lib/CodeGen/TargetInstrInfo.cpp
+++ b/lib/CodeGen/TargetInstrInfo.cpp
@@ -25,6 +25,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
@@ -307,7 +308,7 @@
 
   assert(RC->getSize() >= (Offset + Size) && "bad subregister range");
 
-  if (!TM->getSubtargetImpl()->getDataLayout()->isLittleEndian()) {
+  if (!TM->getDataLayout()->isLittleEndian()) {
     Offset = RC->getSize() - (Offset + Size);
   }
   return true;
@@ -644,6 +645,28 @@
   return true;
 }
 
+int TargetInstrInfo::getSPAdjust(const MachineInstr *MI) const {
+  const MachineFunction *MF = MI->getParent()->getParent();
+  const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+  bool StackGrowsDown =
+    TFI->getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+
+  int FrameSetupOpcode = getCallFrameSetupOpcode();
+  int FrameDestroyOpcode = getCallFrameDestroyOpcode();
+
+  if (MI->getOpcode() != FrameSetupOpcode &&
+      MI->getOpcode() != FrameDestroyOpcode)
+    return 0;
+
+  int SPAdj = MI->getOperand(0).getImm();
+
+  if ((!StackGrowsDown && MI->getOpcode() == FrameSetupOpcode) ||
+       (StackGrowsDown && MI->getOpcode() == FrameDestroyOpcode))
+    SPAdj = -SPAdj;
+
+  return SPAdj;
+}
+
 /// isSchedulingBoundary - Test if the given instruction should be
 /// considered a scheduling boundary. This primarily includes labels
 /// and terminators.
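
A minimal sketch of how a client could use the getSPAdjust() hook added above (TII and MBB are assumed to be in scope; this is commentary, not code from the patch). Only the call-frame setup and destroy pseudos report a non-zero value, with opposite signs, so the contributions of a complete call sequence cancel out.

// Accumulate the stack-pointer delta contributed by call-frame pseudos.
int SPOffset = 0;
for (const llvm::MachineInstr &MI : MBB)
  SPOffset += TII->getSPAdjust(&MI);   // 0 for ordinary instructions
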
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index e833fd3..9048a44 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -414,7 +414,7 @@
     Names[RTLIB::SINCOS_PPCF128] = nullptr;
   }
 
-  if (TT.getOS() != Triple::OpenBSD) {
+  if (!TT.isOSOpenBSD()) {
     Names[RTLIB::STACKPROTECTOR_CHECK_FAIL] = "__stack_chk_fail";
   } else {
     // These are generally not available.
@@ -696,7 +696,7 @@
 
 /// NOTE: The TargetMachine owns TLOF.
 TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm)
-    : TM(tm), DL(TM.getSubtargetImpl()->getDataLayout()) {
+    : TM(tm), DL(TM.getDataLayout()) {
   initActions();
 
   // Perform these initializations only once.
@@ -710,10 +710,12 @@
   HasMultipleConditionRegisters = false;
   HasExtractBitsInsn = false;
   IntDivIsCheap = false;
+  FsqrtIsCheap = false;
   Pow2SDivIsCheap = false;
   JumpIsExpensive = false;
   PredictableSelectIsExpensive = false;
   MaskAndBranchFoldingIsLegal = false;
+  EnableExtLdPromotion = false;
   HasFloatingPointExceptions = true;
   StackPointerRegisterToSaveRestore = 0;
   ExceptionPointerRegister = 0;
@@ -747,37 +749,33 @@
   memset(TargetDAGCombineArray, 0, array_lengthof(TargetDAGCombineArray));
 
   // Set default actions for various operations.
-  for (unsigned VT = 0; VT != (unsigned)MVT::LAST_VALUETYPE; ++VT) {
+  for (MVT VT : MVT::all_valuetypes()) {
     // Default all indexed load / store to expand.
     for (unsigned IM = (unsigned)ISD::PRE_INC;
          IM != (unsigned)ISD::LAST_INDEXED_MODE; ++IM) {
-      setIndexedLoadAction(IM, (MVT::SimpleValueType)VT, Expand);
-      setIndexedStoreAction(IM, (MVT::SimpleValueType)VT, Expand);
+      setIndexedLoadAction(IM, VT, Expand);
+      setIndexedStoreAction(IM, VT, Expand);
     }
 
     // Most backends expect to see the node which just returns the value loaded.
-    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
-                       (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Expand);
 
     // These operations default to expand.
-    setOperationAction(ISD::FGETSIGN, (MVT::SimpleValueType)VT, Expand);
-    setOperationAction(ISD::CONCAT_VECTORS, (MVT::SimpleValueType)VT, Expand);
-    setOperationAction(ISD::FMINNUM, (MVT::SimpleValueType)VT, Expand);
-    setOperationAction(ISD::FMAXNUM, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FGETSIGN, VT, Expand);
+    setOperationAction(ISD::CONCAT_VECTORS, VT, Expand);
+    setOperationAction(ISD::FMINNUM, VT, Expand);
+    setOperationAction(ISD::FMAXNUM, VT, Expand);
+    setOperationAction(ISD::FMAD, VT, Expand);
 
     // These library functions default to expand.
-    setOperationAction(ISD::FROUND, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::FROUND, VT, Expand);
 
     // These operations default to expand for vector types.
-    if (VT >= MVT::FIRST_VECTOR_VALUETYPE &&
-        VT <= MVT::LAST_VECTOR_VALUETYPE) {
-      setOperationAction(ISD::FCOPYSIGN, (MVT::SimpleValueType)VT, Expand);
-      setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG,
-                         (MVT::SimpleValueType)VT, Expand);
-      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG,
-                         (MVT::SimpleValueType)VT, Expand);
-      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG,
-                         (MVT::SimpleValueType)VT, Expand);
+    if (VT.isVector()) {
+      setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+      setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, VT, Expand);
+      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Expand);
+      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Expand);
     }
   }
 
@@ -897,6 +895,138 @@
   }
 }
 
+TargetLoweringBase::LegalizeKind
+TargetLoweringBase::getTypeConversion(LLVMContext &Context, EVT VT) const {
+  // If this is a simple type, use the ComputeRegisterProp mechanism.
+  if (VT.isSimple()) {
+    MVT SVT = VT.getSimpleVT();
+    assert((unsigned)SVT.SimpleTy < array_lengthof(TransformToType));
+    MVT NVT = TransformToType[SVT.SimpleTy];
+    LegalizeTypeAction LA = ValueTypeActions.getTypeAction(SVT);
+
+    assert((LA == TypeLegal || LA == TypeSoftenFloat ||
+            ValueTypeActions.getTypeAction(NVT) != TypePromoteInteger) &&
+           "Promote may not follow Expand or Promote");
+
+    if (LA == TypeSplitVector)
+      return LegalizeKind(LA,
+                          EVT::getVectorVT(Context, SVT.getVectorElementType(),
+                                           SVT.getVectorNumElements() / 2));
+    if (LA == TypeScalarizeVector)
+      return LegalizeKind(LA, SVT.getVectorElementType());
+    return LegalizeKind(LA, NVT);
+  }
+
+  // Handle Extended Scalar Types.
+  if (!VT.isVector()) {
+    assert(VT.isInteger() && "Float types must be simple");
+    unsigned BitSize = VT.getSizeInBits();
+    // First promote to a power-of-two size, then expand if necessary.
+    if (BitSize < 8 || !isPowerOf2_32(BitSize)) {
+      EVT NVT = VT.getRoundIntegerType(Context);
+      assert(NVT != VT && "Unable to round integer VT");
+      LegalizeKind NextStep = getTypeConversion(Context, NVT);
+      // Avoid multi-step promotion.
+      if (NextStep.first == TypePromoteInteger)
+        return NextStep;
+      // Return rounded integer type.
+      return LegalizeKind(TypePromoteInteger, NVT);
+    }
+
+    return LegalizeKind(TypeExpandInteger,
+                        EVT::getIntegerVT(Context, VT.getSizeInBits() / 2));
+  }
+
+  // Handle vector types.
+  unsigned NumElts = VT.getVectorNumElements();
+  EVT EltVT = VT.getVectorElementType();
+
+  // Vectors with only one element are always scalarized.
+  if (NumElts == 1)
+    return LegalizeKind(TypeScalarizeVector, EltVT);
+
+  // Try to widen vector elements until the element type is a power of two and
+  // promote it to a legal type later on, for example:
+  // <3 x i8> -> <4 x i8> -> <4 x i32>
+  if (EltVT.isInteger()) {
+    // Vectors with a number of elements that is not a power of two are always
+    // widened, for example <3 x i8> -> <4 x i8>.
+    if (!VT.isPow2VectorType()) {
+      NumElts = (unsigned)NextPowerOf2(NumElts);
+      EVT NVT = EVT::getVectorVT(Context, EltVT, NumElts);
+      return LegalizeKind(TypeWidenVector, NVT);
+    }
+
+    // Examine the element type.
+    LegalizeKind LK = getTypeConversion(Context, EltVT);
+
+    // If type is to be expanded, split the vector.
+    //  <4 x i140> -> <2 x i140>
+    if (LK.first == TypeExpandInteger)
+      return LegalizeKind(TypeSplitVector,
+                          EVT::getVectorVT(Context, EltVT, NumElts / 2));
+
+    // Promote the integer element types until a legal vector type is found
+    // or until the element integer type is too big. If a legal type was not
+    // found, fallback to the usual mechanism of widening/splitting the
+    // vector.
+    EVT OldEltVT = EltVT;
+    while (1) {
+      // Increase the bitwidth of the element to the next pow-of-two
+      // (which is greater than 8 bits).
+      EltVT = EVT::getIntegerVT(Context, 1 + EltVT.getSizeInBits())
+                  .getRoundIntegerType(Context);
+
+      // Stop trying when getting a non-simple element type.
+      // Note that vector elements may be greater than legal vector element
+      // types. Example: X86 XMM registers hold 64bit element on 32bit
+      // systems.
+      if (!EltVT.isSimple())
+        break;
+
+      // Build a new vector type and check if it is legal.
+      MVT NVT = MVT::getVectorVT(EltVT.getSimpleVT(), NumElts);
+      // Found a legal promoted vector type.
+      if (NVT != MVT() && ValueTypeActions.getTypeAction(NVT) == TypeLegal)
+        return LegalizeKind(TypePromoteInteger,
+                            EVT::getVectorVT(Context, EltVT, NumElts));
+    }
+
+    // Reset the type to the unexpanded type if we did not find a legal vector
+    // type with a promoted vector element type.
+    EltVT = OldEltVT;
+  }
+
+  // Try to widen the vector until a legal type is found.
+  // If there is no wider legal type, split the vector.
+  while (1) {
+    // Round up to the next power of 2.
+    NumElts = (unsigned)NextPowerOf2(NumElts);
+
+    // If there is no simple vector type with this many elements then there
+    // cannot be a larger legal vector type.  Note that this assumes that
+    // there are no skipped intermediate vector types in the simple types.
+    if (!EltVT.isSimple())
+      break;
+    MVT LargerVector = MVT::getVectorVT(EltVT.getSimpleVT(), NumElts);
+    if (LargerVector == MVT())
+      break;
+
+    // If this type is legal then widen the vector.
+    if (ValueTypeActions.getTypeAction(LargerVector) == TypeLegal)
+      return LegalizeKind(TypeWidenVector, LargerVector);
+  }
+
+  // Widen odd vectors to next power of two.
+  if (!VT.isPow2VectorType()) {
+    EVT NVT = VT.getPow2VectorType(Context);
+    return LegalizeKind(TypeWidenVector, NVT);
+  }
+
+  // Vectors with illegal element types are expanded.
+  EVT NVT = EVT::getVectorVT(Context, EltVT, VT.getVectorNumElements() / 2);
+  return LegalizeKind(TypeSplitVector, NVT);
+}
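
A worked example of the scalar-integer rules above (a sketch assuming the helper is reachable from the calling code; the same mapping is otherwise observable through getTypeAction() and getTypeToTransformTo()): an odd-width integer is rounded up to the next power-of-two width in a single promotion step, while an over-wide power-of-two integer is expanded into two halves.

// 'TLI' is some TargetLoweringBase and 'Ctx' an LLVMContext; both assumed.
EVT OddInt  = EVT::getIntegerVT(Ctx, 6);    // not a power of two
EVT HugeInt = EVT::getIntegerVT(Ctx, 256);  // power of two, but not simple
auto LK1 = TLI.getTypeConversion(Ctx, OddInt);
// LK1: {TypePromoteInteger, i8}  - rounded up, one promotion step
auto LK2 = TLI.getTypeConversion(Ctx, HugeInt);
// LK2: {TypeExpandInteger, i128} - expanded into two halves
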
 
 static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT,
                                           unsigned &NumIntermediates,
@@ -992,10 +1122,15 @@
     // Add a new memory operand for this FI.
     const MachineFrameInfo &MFI = *MF.getFrameInfo();
     assert(MFI.getObjectOffset(FI) != -1);
+
+    unsigned Flags = MachineMemOperand::MOLoad;
+    if (MI->getOpcode() == TargetOpcode::STATEPOINT) {
+      Flags |= MachineMemOperand::MOStore;
+      Flags |= MachineMemOperand::MOVolatile;
+    }
     MachineMemOperand *MMO = MF.getMachineMemOperand(
-        MachinePointerInfo::getFixedStack(FI), MachineMemOperand::MOLoad,
-        TM.getSubtargetImpl()->getDataLayout()->getPointerSize(),
-        MFI.getObjectAlignment(FI));
+        MachinePointerInfo::getFixedStack(FI), Flags,
+        TM.getDataLayout()->getPointerSize(), MFI.getObjectAlignment(FI));
     MIB->addMemOperand(MF, MMO);
 
     // Replace the instruction and update the operand index.
@@ -1009,10 +1144,9 @@
 
 /// findRepresentativeClass - Return the largest legal super-reg register class
 /// of the register class for the specified type and its associated "cost".
-std::pair<const TargetRegisterClass*, uint8_t>
-TargetLoweringBase::findRepresentativeClass(MVT VT) const {
-  const TargetRegisterInfo *TRI =
-      getTargetMachine().getSubtargetImpl()->getRegisterInfo();
+std::pair<const TargetRegisterClass *, uint8_t>
+TargetLoweringBase::findRepresentativeClass(const TargetRegisterInfo *TRI,
+                                            MVT VT) const {
   const TargetRegisterClass *RC = RegClassForVT[VT.SimpleTy];
   if (!RC)
     return std::make_pair(RC, 0);
@@ -1038,7 +1172,8 @@
 
 /// computeRegisterProperties - Once all of the register classes are added,
 /// this allows us to compute derived properties we expose.
-void TargetLoweringBase::computeRegisterProperties() {
+void TargetLoweringBase::computeRegisterProperties(
+    const TargetRegisterInfo *TRI) {
   static_assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE,
                 "Too many value types for ValueTypeActions to hold!");
 
@@ -1220,7 +1355,7 @@
   for (unsigned i = 0; i != MVT::LAST_VALUETYPE; ++i) {
     const TargetRegisterClass* RRC;
     uint8_t Cost;
-    std::tie(RRC, Cost) = findRepresentativeClass((MVT::SimpleValueType)i);
+    std::tie(RRC, Cost) = findRepresentativeClass(TRI, (MVT::SimpleValueType)i);
     RepRegClassForVT[i] = RRC;
     RepRegClassCostForVT[i] = Cost;
   }
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index efd15e1..c1b34f7 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -71,12 +71,10 @@
   const MCSection *Sec = getContext().getELFSection(NameData,
                                                     ELF::SHT_PROGBITS,
                                                     Flags,
-                                                    SectionKind::getDataRel(),
                                                     0, Label->getName());
-  unsigned Size = TM.getSubtargetImpl()->getDataLayout()->getPointerSize();
+  unsigned Size = TM.getDataLayout()->getPointerSize();
   Streamer.SwitchSection(Sec);
-  Streamer.EmitValueToAlignment(
-      TM.getSubtargetImpl()->getDataLayout()->getPointerABIAlignment());
+  Streamer.EmitValueToAlignment(TM.getDataLayout()->getPointerABIAlignment());
   Streamer.EmitSymbolAttribute(Label, MCSA_ELF_TypeObject);
   const MCExpr *E = MCConstantExpr::Create(Size, getContext());
   Streamer.EmitELFSize(Label, E);
@@ -166,9 +164,7 @@
   return ELF::SHT_PROGBITS;
 }
 
-
-static unsigned
-getELFSectionFlags(SectionKind K) {
+static unsigned getELFSectionFlags(SectionKind K) {
   unsigned Flags = 0;
 
   if (!K.isMetadata())
@@ -183,9 +179,7 @@
   if (K.isThreadLocal())
     Flags |= ELF::SHF_TLS;
 
-  // K.isMergeableConst() is left out to honour PR4650
-  if (K.isMergeableCString() || K.isMergeableConst4() ||
-      K.isMergeableConst8() || K.isMergeableConst16())
+  if (K.isMergeableCString() || K.isMergeableConst())
     Flags |= ELF::SHF_MERGE;
 
   if (K.isMergeableCString())
@@ -222,120 +216,121 @@
   }
   return getContext().getELFSection(SectionName,
                                     getELFSectionType(SectionName, Kind), Flags,
-                                    Kind, /*EntrySize=*/0, Group);
+                                    /*EntrySize=*/0, Group);
 }
 
-/// getSectionPrefixForGlobal - Return the section prefix name used by options
-/// FunctionsSections and DataSections.
+/// Return the section prefix name used by options FunctionsSections and
+/// DataSections.
 static StringRef getSectionPrefixForGlobal(SectionKind Kind) {
-  if (Kind.isText())                 return ".text.";
-  if (Kind.isReadOnly())             return ".rodata.";
-  if (Kind.isBSS())                  return ".bss.";
-
-  if (Kind.isThreadData())           return ".tdata.";
-  if (Kind.isThreadBSS())            return ".tbss.";
-
-  if (Kind.isDataNoRel())            return ".data.";
-  if (Kind.isDataRelLocal())         return ".data.rel.local.";
-  if (Kind.isDataRel())              return ".data.rel.";
-  if (Kind.isReadOnlyWithRelLocal()) return ".data.rel.ro.local.";
-
+  if (Kind.isText())
+    return ".text";
+  if (Kind.isReadOnly())
+    return ".rodata";
+  if (Kind.isBSS())
+    return ".bss";
+  if (Kind.isThreadData())
+    return ".tdata";
+  if (Kind.isThreadBSS())
+    return ".tbss";
+  if (Kind.isDataNoRel())
+    return ".data";
+  if (Kind.isDataRelLocal())
+    return ".data.rel.local";
+  if (Kind.isDataRel())
+    return ".data.rel";
+  if (Kind.isReadOnlyWithRelLocal())
+    return ".data.rel.ro.local";
   assert(Kind.isReadOnlyWithRel() && "Unknown section kind");
-  return ".data.rel.ro.";
+  return ".data.rel.ro";
 }
 
 const MCSection *TargetLoweringObjectFileELF::
 SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
                        Mangler &Mang, const TargetMachine &TM) const {
+  unsigned Flags = getELFSectionFlags(Kind);
+
   // If we have -ffunction-section or -fdata-section then we should emit the
   // global value to a uniqued section specifically for it.
-  bool EmitUniquedSection;
-  if (Kind.isText())
-    EmitUniquedSection = TM.getFunctionSections();
-  else
-    EmitUniquedSection = TM.getDataSections();
+  bool EmitUniqueSection = false;
+  if (!(Flags & ELF::SHF_MERGE) && !Kind.isCommon()) {
+    if (Kind.isText())
+      EmitUniqueSection = TM.getFunctionSections();
+    else
+      EmitUniqueSection = TM.getDataSections();
+  }
+  EmitUniqueSection |= GV->hasComdat();
 
-  // If this global is linkonce/weak and the target handles this by emitting it
-  // into a 'uniqued' section name, create and return the section now.
-  if ((GV->isWeakForLinker() || EmitUniquedSection || GV->hasComdat()) &&
-      !Kind.isCommon()) {
-    StringRef Prefix = getSectionPrefixForGlobal(Kind);
-
-    SmallString<128> Name(Prefix);
-    TM.getNameWithPrefix(Name, GV, Mang, true);
-
-    StringRef Group = "";
-    unsigned Flags = getELFSectionFlags(Kind);
-    if (GV->isWeakForLinker() || GV->hasComdat()) {
-      if (const Comdat *C = getELFComdat(GV))
-        Group = C->getName();
-      else
-        Group = Name.substr(Prefix.size());
-      Flags |= ELF::SHF_GROUP;
+  unsigned EntrySize = 0;
+  if (Kind.isMergeableCString()) {
+    if (Kind.isMergeable2ByteCString()) {
+      EntrySize = 2;
+    } else if (Kind.isMergeable4ByteCString()) {
+      EntrySize = 4;
+    } else {
+      EntrySize = 1;
+      assert(Kind.isMergeable1ByteCString() && "unknown string width");
     }
-
-    return getContext().getELFSection(Name.str(),
-                                      getELFSectionType(Name.str(), Kind),
-                                      Flags, Kind, 0, Group);
+  } else if (Kind.isMergeableConst()) {
+    if (Kind.isMergeableConst4()) {
+      EntrySize = 4;
+    } else if (Kind.isMergeableConst8()) {
+      EntrySize = 8;
+    } else {
+      assert(Kind.isMergeableConst16() && "unknown data width");
+      EntrySize = 16;
+    }
   }
 
-  if (Kind.isText()) return TextSection;
+  StringRef Group = "";
+  if (const Comdat *C = getELFComdat(GV)) {
+    Flags |= ELF::SHF_GROUP;
+    Group = C->getName();
+  }
 
-  if (Kind.isMergeable1ByteCString() ||
-      Kind.isMergeable2ByteCString() ||
-      Kind.isMergeable4ByteCString()) {
-
+  bool UniqueSectionNames = TM.getUniqueSectionNames();
+  SmallString<128> Name;
+  if (Kind.isMergeableCString()) {
     // We also need alignment here.
     // FIXME: this is getting the alignment of the character, not the
     // alignment of the global!
     unsigned Align =
-        TM.getSubtargetImpl()->getDataLayout()->getPreferredAlignment(
-            cast<GlobalVariable>(GV));
+        TM.getDataLayout()->getPreferredAlignment(cast<GlobalVariable>(GV));
 
-    const char *SizeSpec = ".rodata.str1.";
-    if (Kind.isMergeable2ByteCString())
-      SizeSpec = ".rodata.str2.";
-    else if (Kind.isMergeable4ByteCString())
-      SizeSpec = ".rodata.str4.";
-    else
-      assert(Kind.isMergeable1ByteCString() && "unknown string width");
-
-
-    std::string Name = SizeSpec + utostr(Align);
-    return getContext().getELFSection(Name, ELF::SHT_PROGBITS,
-                                      ELF::SHF_ALLOC |
-                                      ELF::SHF_MERGE |
-                                      ELF::SHF_STRINGS,
-                                      Kind);
+    std::string SizeSpec = ".rodata.str" + utostr(EntrySize) + ".";
+    Name = SizeSpec + utostr(Align);
+  } else if (Kind.isMergeableConst()) {
+    Name = ".rodata.cst";
+    Name += utostr(EntrySize);
+  } else {
+    Name = getSectionPrefixForGlobal(Kind);
   }
 
-  if (Kind.isMergeableConst()) {
-    if (Kind.isMergeableConst4() && MergeableConst4Section)
-      return MergeableConst4Section;
-    if (Kind.isMergeableConst8() && MergeableConst8Section)
-      return MergeableConst8Section;
-    if (Kind.isMergeableConst16() && MergeableConst16Section)
-      return MergeableConst16Section;
-    return ReadOnlySection;  // .const
+  if (EmitUniqueSection && UniqueSectionNames) {
+    Name.push_back('.');
+    TM.getNameWithPrefix(Name, GV, Mang, true);
   }
+  return getContext().getELFSection(Name, getELFSectionType(Name, Kind), Flags,
+                                    EntrySize, Group,
+                                    EmitUniqueSection && !UniqueSectionNames);
+}
 
-  if (Kind.isReadOnly())             return ReadOnlySection;
+const MCSection *TargetLoweringObjectFileELF::getSectionForJumpTable(
+    const Function &F, Mangler &Mang, const TargetMachine &TM) const {
+  // If the function can be removed, produce a unique section so that
+  // the table doesn't prevent the removal.
+  const Comdat *C = F.getComdat();
+  bool EmitUniqueSection = TM.getFunctionSections() || C;
+  if (!EmitUniqueSection)
+    return ReadOnlySection;
 
-  if (Kind.isThreadData())           return TLSDataSection;
-  if (Kind.isThreadBSS())            return TLSBSSSection;
+  return SelectSectionForGlobal(&F, SectionKind::getReadOnly(), Mang, TM);
+}
 
-  // Note: we claim that common symbols are put in BSSSection, but they are
-  // really emitted with the magic .comm directive, which creates a symbol table
-  // entry but not a section.
-  if (Kind.isBSS() || Kind.isCommon()) return BSSSection;
-
-  if (Kind.isDataNoRel())            return DataSection;
-  if (Kind.isDataRelLocal())         return DataRelLocalSection;
-  if (Kind.isDataRel())              return DataRelSection;
-  if (Kind.isReadOnlyWithRelLocal()) return DataRelROLocalSection;
-
-  assert(Kind.isReadOnlyWithRel() && "Unknown section kind");
-  return DataRelROSection;
+bool TargetLoweringObjectFileELF::shouldPutJumpTableInFunctionSection(
+    bool UsesLabelDifference, const Function &F) const {
+  // We can always create relative relocations, so use another section
+  // that can be marked non-executable.
+  return false;
 }
 
 /// getSectionForConstant - Given a mergeable constant with the
@@ -366,7 +361,6 @@
   std::string Name;
   unsigned Type;
   unsigned Flags = ELF::SHF_ALLOC | ELF::SHF_WRITE;
-  SectionKind Kind = SectionKind::getDataRel();
   StringRef COMDAT = KeySym ? KeySym->getName() : "";
 
   if (KeySym)
@@ -398,7 +392,7 @@
     Type = ELF::SHT_PROGBITS;
   }
 
-  return Ctx.getELFSection(Name, Type, Flags, Kind, 0, COMDAT);
+  return Ctx.getELFSection(Name, Type, Flags, 0, COMDAT);
 }
 
 const MCSection *TargetLoweringObjectFileELF::getStaticCtorSection(
@@ -419,16 +413,10 @@
   if (!UseInitArray)
     return;
 
-  StaticCtorSection =
-    getContext().getELFSection(".init_array", ELF::SHT_INIT_ARRAY,
-                               ELF::SHF_WRITE |
-                               ELF::SHF_ALLOC,
-                               SectionKind::getDataRel());
-  StaticDtorSection =
-    getContext().getELFSection(".fini_array", ELF::SHT_FINI_ARRAY,
-                               ELF::SHF_WRITE |
-                               ELF::SHF_ALLOC,
-                               SectionKind::getDataRel());
+  StaticCtorSection = getContext().getELFSection(
+      ".init_array", ELF::SHT_INIT_ARRAY, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+  StaticDtorSection = getContext().getELFSection(
+      ".fini_array", ELF::SHT_FINI_ARRAY, ELF::SHF_WRITE | ELF::SHF_ALLOC);
 }
 
 //===----------------------------------------------------------------------===//
@@ -464,14 +452,15 @@
       continue;
 
     StringRef Key = MFE.Key->getString();
-    Value *Val = MFE.Val;
+    Metadata *Val = MFE.Val;
 
     if (Key == "Objective-C Image Info Version") {
-      VersionVal = cast<ConstantInt>(Val)->getZExtValue();
+      VersionVal = mdconst::extract<ConstantInt>(Val)->getZExtValue();
     } else if (Key == "Objective-C Garbage Collection" ||
                Key == "Objective-C GC Only" ||
-               Key == "Objective-C Is Simulated") {
-      ImageInfoFlags |= cast<ConstantInt>(Val)->getZExtValue();
+               Key == "Objective-C Is Simulated" ||
+               Key == "Objective-C Image Swift Version") {
+      ImageInfoFlags |= mdconst::extract<ConstantInt>(Val)->getZExtValue();
     } else if (Key == "Objective-C Image Info Section") {
       SectionVal = cast<MDString>(Val)->getString();
     } else if (Key == "Linker Options") {
@@ -572,60 +561,6 @@
   return S;
 }
 
-bool TargetLoweringObjectFileMachO::isSectionAtomizableBySymbols(
-    const MCSection &Section) const {
-    const MCSectionMachO &SMO = static_cast<const MCSectionMachO&>(Section);
-
-    // Sections holding 1 byte strings are atomized based on the data
-    // they contain.
-    // Sections holding 2 byte strings require symbols in order to be
-    // atomized.
-    // There is no dedicated section for 4 byte strings.
-    if (SMO.getKind().isMergeable1ByteCString())
-      return false;
-
-    if (SMO.getSegmentName() == "__TEXT" &&
-        SMO.getSectionName() == "__objc_classname" &&
-        SMO.getType() == MachO::S_CSTRING_LITERALS)
-      return false;
-
-    if (SMO.getSegmentName() == "__TEXT" &&
-        SMO.getSectionName() == "__objc_methname" &&
-        SMO.getType() == MachO::S_CSTRING_LITERALS)
-      return false;
-
-    if (SMO.getSegmentName() == "__TEXT" &&
-        SMO.getSectionName() == "__objc_methtype" &&
-        SMO.getType() == MachO::S_CSTRING_LITERALS)
-      return false;
-
-    if (SMO.getSegmentName() == "__DATA" &&
-        SMO.getSectionName() == "__cfstring")
-      return false;
-
-    // no_dead_strip sections are not atomized in practice.
-    if (SMO.hasAttribute(MachO::S_ATTR_NO_DEAD_STRIP))
-      return false;
-
-    switch (SMO.getType()) {
-    default:
-      return true;
-
-      // These sections are atomized at the element boundaries without using
-      // symbols.
-    case MachO::S_4BYTE_LITERALS:
-    case MachO::S_8BYTE_LITERALS:
-    case MachO::S_16BYTE_LITERALS:
-    case MachO::S_LITERAL_POINTERS:
-    case MachO::S_NON_LAZY_SYMBOL_POINTERS:
-    case MachO::S_LAZY_SYMBOL_POINTERS:
-    case MachO::S_MOD_INIT_FUNC_POINTERS:
-    case MachO::S_MOD_TERM_FUNC_POINTERS:
-    case MachO::S_INTERPOSING:
-      return false;
-    }
-}
-
 const MCSection *TargetLoweringObjectFileMachO::
 SelectSectionForGlobal(const GlobalValue *GV, SectionKind Kind,
                        Mangler &Mang, const TargetMachine &TM) const {
@@ -648,16 +583,14 @@
 
   // FIXME: Alignment check should be handled by section classifier.
   if (Kind.isMergeable1ByteCString() &&
-      TM.getSubtargetImpl()->getDataLayout()->getPreferredAlignment(
-          cast<GlobalVariable>(GV)) < 32)
+      TM.getDataLayout()->getPreferredAlignment(cast<GlobalVariable>(GV)) < 32)
     return CStringSection;
 
   // Do not put 16-bit arrays in the UString section if they have an
   // externally visible label, this runs into issues with certain linker
   // versions.
   if (Kind.isMergeable2ByteCString() && !GV->hasExternalLinkage() &&
-      TM.getSubtargetImpl()->getDataLayout()->getPreferredAlignment(
-          cast<GlobalVariable>(GV)) < 32)
+      TM.getDataLayout()->getPreferredAlignment(cast<GlobalVariable>(GV)) < 32)
     return UStringSection;
 
   // With MachO only variables whose corresponding symbol starts with 'l' or
@@ -854,7 +787,7 @@
   unsigned Characteristics = getCOFFSectionFlags(Kind);
   StringRef Name = GV->getSection();
   StringRef COMDATSymName = "";
-  if ((GV->isWeakForLinker() || GV->hasComdat()) && !Kind.isCommon()) {
+  if (GV->hasComdat()) {
     Selection = getSelectionForCOFF(GV);
     const GlobalValue *ComdatGV;
     if (Selection == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE)
@@ -901,12 +834,7 @@
   else
     EmitUniquedSection = TM.getDataSections();
 
-  // If this global is linkonce/weak and the target handles this by emitting it
-  // into a 'uniqued' section name, create and return the section now.
-  // Section names depend on the name of the symbol which is not feasible if the
-  // symbol has private linkage.
-  if ((GV->isWeakForLinker() || EmitUniquedSection || GV->hasComdat()) &&
-      !Kind.isCommon()) {
+  if ((EmitUniquedSection && !Kind.isCommon()) || GV->hasComdat()) {
     const char *Name = getCOFFSectionNameForUniqueGlobal(Kind);
     unsigned Characteristics = getCOFFSectionFlags(Kind);
 
@@ -965,7 +893,7 @@
        i = ModuleFlags.begin(), e = ModuleFlags.end(); i != e; ++i) {
     const Module::ModuleFlagEntry &MFE = *i;
     StringRef Key = MFE.Key->getString();
-    Value *Val = MFE.Val;
+    Metadata *Val = MFE.Val;
     if (Key == "Linker Options") {
       LinkerOptions = cast<MDNode>(Val);
       break;
@@ -982,21 +910,10 @@
     MDNode *MDOptions = cast<MDNode>(LinkerOptions->getOperand(i));
     for (unsigned ii = 0, ie = MDOptions->getNumOperands(); ii != ie; ++ii) {
       MDString *MDOption = cast<MDString>(MDOptions->getOperand(ii));
-      StringRef Op = MDOption->getString();
       // Lead with a space for consistency with our dllexport implementation.
-      std::string Escaped(" ");
-      if (Op.find(" ") != StringRef::npos) {
-        // The PE-COFF spec says args with spaces must be quoted.  It doesn't say
-        // how to escape quotes, but it probably uses this algorithm:
-        // http://msdn.microsoft.com/en-us/library/17w5ykft(v=vs.85).aspx
-        // FIXME: Reuse escaping code from Support/Windows/Program.inc
-        Escaped.push_back('\"');
-        Escaped.append(Op);
-        Escaped.push_back('\"');
-      } else {
-        Escaped.append(Op);
-      }
-      Streamer.EmitBytes(Escaped);
+      std::string Directive(" ");
+      Directive.append(MDOption->getString());
+      Streamer.EmitBytes(Directive);
     }
   }
 }
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index e218a83..1bbe6e1 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1515,9 +1515,9 @@
   MF = &Func;
   const TargetMachine &TM = MF->getTarget();
   MRI = &MF->getRegInfo();
-  TII = TM.getSubtargetImpl()->getInstrInfo();
-  TRI = TM.getSubtargetImpl()->getRegisterInfo();
-  InstrItins = TM.getSubtargetImpl()->getInstrItineraryData();
+  TII = MF->getSubtarget().getInstrInfo();
+  TRI = MF->getSubtarget().getRegisterInfo();
+  InstrItins = MF->getSubtarget().getInstrItineraryData();
   LV = getAnalysisIfAvailable<LiveVariables>();
   LIS = getAnalysisIfAvailable<LiveIntervals>();
   AA = &getAnalysis<AliasAnalysis>();
diff --git a/lib/CodeGen/UnreachableBlockElim.cpp b/lib/CodeGen/UnreachableBlockElim.cpp
index 7824f92..d393e10 100644
--- a/lib/CodeGen/UnreachableBlockElim.cpp
+++ b/lib/CodeGen/UnreachableBlockElim.cpp
@@ -88,7 +88,7 @@
     DeadBlocks[i]->eraseFromParent();
   }
 
-  return DeadBlocks.size();
+  return !DeadBlocks.empty();
 }
 
 
@@ -204,5 +204,5 @@
 
   F.RenumberBlocks();
 
-  return (DeadBlocks.size() || ModifiedPHI);
+  return (!DeadBlocks.empty() || ModifiedPHI);
 }
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index 0d17d43..7d3b0ce 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -252,20 +252,41 @@
     unsigned PhysReg = VRM->getPhys(VirtReg);
     assert(PhysReg != VirtRegMap::NO_PHYS_REG && "Unmapped virtual register.");
 
-    // Scan the segments of LI.
-    for (LiveInterval::const_iterator I = LI.begin(), E = LI.end(); I != E;
-         ++I) {
-      if (!Indexes->findLiveInMBBs(I->start, I->end, LiveIn))
-        continue;
-      for (unsigned i = 0, e = LiveIn.size(); i != e; ++i)
-        if (!LiveIn[i]->isLiveIn(PhysReg))
-          LiveIn[i]->addLiveIn(PhysReg);
-      LiveIn.clear();
+    if (LI.hasSubRanges()) {
+      for (LiveInterval::SubRange &S : LI.subranges()) {
+        for (const auto &Seg : S.segments) {
+          if (!Indexes->findLiveInMBBs(Seg.start, Seg.end, LiveIn))
+            continue;
+          for (MCSubRegIndexIterator SR(PhysReg, TRI); SR.isValid(); ++SR) {
+            unsigned SubReg = SR.getSubReg();
+            unsigned SubRegIndex = SR.getSubRegIndex();
+            unsigned SubRegLaneMask = TRI->getSubRegIndexLaneMask(SubRegIndex);
+            if ((SubRegLaneMask & S.LaneMask) == 0)
+              continue;
+            for (unsigned i = 0, e = LiveIn.size(); i != e; ++i) {
+              if (!LiveIn[i]->isLiveIn(SubReg))
+                LiveIn[i]->addLiveIn(SubReg);
+            }
+          }
+          LiveIn.clear();
+        }
+      }
+    } else {
+      // Scan the segments of LI.
+      for (const auto &Seg : LI.segments) {
+        if (!Indexes->findLiveInMBBs(Seg.start, Seg.end, LiveIn))
+          continue;
+        for (unsigned i = 0, e = LiveIn.size(); i != e; ++i)
+          if (!LiveIn[i]->isLiveIn(PhysReg))
+            LiveIn[i]->addLiveIn(PhysReg);
+        LiveIn.clear();
+      }
     }
   }
 }
 
 void VirtRegRewriter::rewrite() {
+  bool NoSubRegLiveness = !MRI->tracksSubRegLiveness();
   SmallVector<unsigned, 8> SuperDeads;
   SmallVector<unsigned, 8> SuperDefs;
   SmallVector<unsigned, 8> SuperKills;
@@ -347,7 +368,8 @@
           // A virtual register kill refers to the whole register, so we may
           // have to add <imp-use,kill> operands for the super-register.  A
           // partial redef always kills and redefines the super-register.
-          if (MO.readsReg() && (MO.isDef() || MO.isKill()))
+          if (NoSubRegLiveness && MO.readsReg()
+              && (MO.isDef() || MO.isKill()))
             SuperKills.push_back(PhysReg);
 
           if (MO.isDef()) {
@@ -358,10 +380,12 @@
             MO.setIsUndef(false);
 
             // Also add implicit defs for the super-register.
-            if (MO.isDead())
-              SuperDeads.push_back(PhysReg);
-            else
-              SuperDefs.push_back(PhysReg);
+            if (NoSubRegLiveness) {
+              if (MO.isDead())
+                SuperDeads.push_back(PhysReg);
+              else
+                SuperDefs.push_back(PhysReg);
+            }
           }
 
           // PhysReg operands cannot have subregister indexes.
diff --git a/lib/CodeGen/WinEHPrepare.cpp b/lib/CodeGen/WinEHPrepare.cpp
new file mode 100644
index 0000000..6f712a9
--- /dev/null
+++ b/lib/CodeGen/WinEHPrepare.cpp
@@ -0,0 +1,626 @@
+//===-- WinEHPrepare - Prepare exception handling for code generation ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers LLVM IR exception handling into something closer to what the
+// backend wants. It snifs the personality function to see which kind of
+// preparation is necessary. If the personality function uses the Itanium LSDA,
+// this pass delegates to the DWARF EH preparation pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/Analysis/LibCallSemantics.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <memory>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "winehprepare"
+
+namespace {
+
+struct HandlerAllocas {
+  TinyPtrVector<AllocaInst *> Allocas;
+  int ParentFrameAllocationIndex;
+};
+
+// This map is used to model frame variable usage during outlining, to
+// construct a structure type to hold the frame variables in a frame
+// allocation block, and to remap the frame variable allocas (including
+// spill locations as needed) to GEPs that get the variable from the
+// frame allocation structure.
+typedef MapVector<AllocaInst *, HandlerAllocas> FrameVarInfoMap;
+
+class WinEHPrepare : public FunctionPass {
+  std::unique_ptr<FunctionPass> DwarfPrepare;
+
+public:
+  static char ID; // Pass identification, replacement for typeid.
+  WinEHPrepare(const TargetMachine *TM = nullptr)
+      : FunctionPass(ID), DwarfPrepare(createDwarfEHPass(TM)) {}
+
+  bool runOnFunction(Function &Fn) override;
+
+  bool doFinalization(Module &M) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  const char *getPassName() const override {
+    return "Windows exception handling preparation";
+  }
+
+private:
+  bool prepareCPPEHHandlers(Function &F,
+                            SmallVectorImpl<LandingPadInst *> &LPads);
+  bool outlineCatchHandler(Function *SrcFn, Constant *SelectorType,
+                           LandingPadInst *LPad, CallInst *&EHAlloc,
+                           AllocaInst *&EHObjPtr, FrameVarInfoMap &VarInfo);
+};
+
+class WinEHFrameVariableMaterializer : public ValueMaterializer {
+public:
+  WinEHFrameVariableMaterializer(Function *OutlinedFn,
+                                 FrameVarInfoMap &FrameVarInfo);
+  ~WinEHFrameVariableMaterializer() {}
+
+  virtual Value *materializeValueFor(Value *V) override;
+
+private:
+  FrameVarInfoMap &FrameVarInfo;
+  IRBuilder<> Builder;
+};
+
+class WinEHCatchDirector : public CloningDirector {
+public:
+  WinEHCatchDirector(LandingPadInst *LPI, Function *CatchFn, Value *Selector,
+                     Value *EHObj, FrameVarInfoMap &VarInfo)
+      : LPI(LPI), CurrentSelector(Selector->stripPointerCasts()), EHObj(EHObj),
+        Materializer(CatchFn, VarInfo),
+        SelectorIDType(Type::getInt32Ty(LPI->getContext())),
+        Int8PtrType(Type::getInt8PtrTy(LPI->getContext())) {}
+
+  CloningAction handleInstruction(ValueToValueMapTy &VMap,
+                                  const Instruction *Inst,
+                                  BasicBlock *NewBB) override;
+
+  ValueMaterializer *getValueMaterializer() override { return &Materializer; }
+
+private:
+  LandingPadInst *LPI;
+  Value *CurrentSelector;
+  Value *EHObj;
+  WinEHFrameVariableMaterializer Materializer;
+  Type *SelectorIDType;
+  Type *Int8PtrType;
+
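+  // Values extracted from the landing pad aggregate and the stack slots
+  // they are spilled to, tracked (starting out null) so the corresponding
+  // stores and loads can be elided or remapped during cloning.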
+  const Value *ExtractedEHPtr;
+  const Value *ExtractedSelector;
+  const Value *EHPtrStoreAddr;
+  const Value *SelectorStoreAddr;
+};
+} // end anonymous namespace
+
+char WinEHPrepare::ID = 0;
+INITIALIZE_TM_PASS(WinEHPrepare, "winehprepare", "Prepare Windows exceptions",
+                   false, false)
+
+FunctionPass *llvm::createWinEHPass(const TargetMachine *TM) {
+  return new WinEHPrepare(TM);
+}
+
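+// Personalities that require the Windows-specific preparation done by this
+// pass.  Anything else falls back to the DWARF EH preparation pass.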
+static bool isMSVCPersonality(EHPersonality Pers) {
+  return Pers == EHPersonality::MSVC_Win64SEH ||
+         Pers == EHPersonality::MSVC_CXX;
+}
+
+bool WinEHPrepare::runOnFunction(Function &Fn) {
+  SmallVector<LandingPadInst *, 4> LPads;
+  SmallVector<ResumeInst *, 4> Resumes;
+  for (BasicBlock &BB : Fn) {
+    if (auto *LP = BB.getLandingPadInst())
+      LPads.push_back(LP);
+    if (auto *Resume = dyn_cast<ResumeInst>(BB.getTerminator()))
+      Resumes.push_back(Resume);
+  }
+
+  // No need to prepare functions that lack landing pads.
+  if (LPads.empty())
+    return false;
+
+  // Classify the personality to see what kind of preparation we need.
+  EHPersonality Pers = classifyEHPersonality(LPads.back()->getPersonalityFn());
+
+  // Delegate to the DWARF EH preparation pass if this is not an MSVC
+  // personality.
+  if (!isMSVCPersonality(Pers))
+    return DwarfPrepare->runOnFunction(Fn);
+
+  // FIXME: This only returns true if the C++ EH handlers were outlined.
+  //        When that code is complete, it should always return whatever
+  //        prepareCPPEHHandlers returns.
+  if (Pers == EHPersonality::MSVC_CXX && prepareCPPEHHandlers(Fn, LPads))
+    return true;
+
+  // FIXME: SEH Cleanups are unimplemented. Replace them with unreachable.
+  if (Resumes.empty())
+    return false;
+
+  for (ResumeInst *Resume : Resumes) {
+    IRBuilder<>(Resume).CreateUnreachable();
+    Resume->eraseFromParent();
+  }
+
+  return true;
+}
+
+bool WinEHPrepare::doFinalization(Module &M) {
+  return DwarfPrepare->doFinalization(M);
+}
+
+void WinEHPrepare::getAnalysisUsage(AnalysisUsage &AU) const {
+  DwarfPrepare->getAnalysisUsage(AU);
+}
+
+bool WinEHPrepare::prepareCPPEHHandlers(
+    Function &F, SmallVectorImpl<LandingPadInst *> &LPads) {
+  // These containers are used to re-map frame variables that are used in
+  // outlined catch and cleanup handlers.  They will be populated as the
+  // handlers are outlined.
+  FrameVarInfoMap FrameVarInfo;
+  SmallVector<CallInst *, 4> HandlerAllocs;
+  SmallVector<AllocaInst *, 4> HandlerEHObjPtrs;
+
+  bool HandlersOutlined = false;
+
+  for (LandingPadInst *LPad : LPads) {
+    // Look for evidence that this landingpad has already been processed.
+    bool LPadHasActionList = false;
+    BasicBlock *LPadBB = LPad->getParent();
+    for (Instruction &Inst : LPadBB->getInstList()) {
+      // FIXME: Make this an intrinsic.
+      if (auto *Call = dyn_cast<CallInst>(&Inst)) {
+        // Indirect calls have no called function; guard against that.
+        const Function *CalledFn = Call->getCalledFunction();
+        if (CalledFn && CalledFn->getName() == "llvm.eh.actions") {
+          LPadHasActionList = true;
+          break;
+        }
+      }
+    }
+
+    // If we've already outlined the handlers for this landingpad,
+    // there's nothing more to do here.
+    if (LPadHasActionList)
+      continue;
+
+    for (unsigned Idx = 0, NumClauses = LPad->getNumClauses(); Idx < NumClauses;
+         ++Idx) {
+      if (LPad->isCatch(Idx)) {
+        // Create a new instance of the handler data structure in the
+        // HandlerData vector.
+        CallInst *EHAlloc = nullptr;
+        AllocaInst *EHObjPtr = nullptr;
+        bool Outlined = outlineCatchHandler(&F, LPad->getClause(Idx), LPad,
+                                            EHAlloc, EHObjPtr, FrameVarInfo);
+        if (Outlined) {
+          HandlersOutlined = true;
+          // These values must be resolved after all handlers have been
+          // outlined.
+          if (EHAlloc)
+            HandlerAllocs.push_back(EHAlloc);
+          if (EHObjPtr)
+            HandlerEHObjPtrs.push_back(EHObjPtr);
+        }
+      } // End if (isCatch)
+    }   // End for each clause
+  }     // End for each landingpad
+
+  // If nothing got outlined, there is no more processing to be done.
+  if (!HandlersOutlined)
+    return false;
+
+  // FIXME: We will replace the landingpad bodies with llvm.eh.actions
+  //        calls and indirect branches here and then delete blocks
+  //        which are no longer reachable.  That will get rid of the
+  //        handlers that we have outlined.  There is code below
+  //        that looks for allocas with no uses in the parent function.
+  //        That will only happen after the pruning is implemented.
+
+  // Remap the frame variables.
+  SmallVector<Type *, 2> StructTys;
+  StructTys.push_back(Type::getInt32Ty(F.getContext()));   // EH state
+  StructTys.push_back(Type::getInt8PtrTy(F.getContext())); // EH object
+
+  // Start the index at two since we always have the above fields at 0 and 1.
+  int Idx = 2;
+
+  // FIXME: Sort the FrameVarInfo vector by the ParentAlloca size and alignment
+  //        and add padding as necessary to provide the proper alignment.
+
+  // Map the alloca instructions to the corresponding index in the
+  // frame allocation structure.  If any alloca is used only in a single
+  // handler and is not used in the parent frame after outlining, it will
+  // be assigned an index of -1, meaning the handler can keep its
+  // "temporary" alloca and the original alloca can be erased from the
+  // parent function.  If we later encounter this alloca in a second
+  // handler, we will assign it a place in the frame allocation structure
+  // at that time.  Since the instruction replacement doesn't happen until
+  // all the entries in the HandlerData have been processed this isn't a
+  // problem.
+  for (auto &VarInfoEntry : FrameVarInfo) {
+    AllocaInst *ParentAlloca = VarInfoEntry.first;
+    HandlerAllocas &AllocaInfo = VarInfoEntry.second;
+
+    // If the instruction still has uses in the parent function or if it is
+    // referenced by more than one handler, add it to the frame allocation
+    // structure.
+    if (ParentAlloca->getNumUses() != 0 || AllocaInfo.Allocas.size() > 1) {
+      Type *VarTy = ParentAlloca->getAllocatedType();
+      StructTys.push_back(VarTy);
+      AllocaInfo.ParentFrameAllocationIndex = Idx++;
+    } else {
+      // If the variable is not used in the parent frame and it is only used
+      // in one handler, the alloca can be removed from the parent frame
+      // and the handler will keep its "temporary" alloca to define the value.
+      // An element index of -1 is used to indicate this condition.
+      AllocaInfo.ParentFrameAllocationIndex = -1;
+    }
+  }
+
+  // Having filled the StructTys vector and assigned an index to each element,
+  // we can now create the structure.
+  StructType *EHDataStructTy = StructType::create(
+      F.getContext(), StructTys, "struct." + F.getName().str() + ".ehdata");
+  IRBuilder<> Builder(F.getParent()->getContext());
+
+  // Create a frame allocation.
+  Module *M = F.getParent();
+  LLVMContext &Context = M->getContext();
+  BasicBlock *Entry = &F.getEntryBlock();
+  Builder.SetInsertPoint(Entry->getFirstInsertionPt());
+  Function *FrameAllocFn =
+      Intrinsic::getDeclaration(M, Intrinsic::frameallocate);
+  uint64_t EHAllocSize = M->getDataLayout()->getTypeAllocSize(EHDataStructTy);
+  Value *FrameAllocArgs[] = {
+      ConstantInt::get(Type::getInt32Ty(Context), EHAllocSize)};
+  CallInst *FrameAlloc =
+      Builder.CreateCall(FrameAllocFn, FrameAllocArgs, "frame.alloc");
+
+  Value *FrameEHData = Builder.CreateBitCast(
+      FrameAlloc, EHDataStructTy->getPointerTo(), "eh.data");
+
+  // Now visit each handler that is using the structure and bitcast its EHAlloc
+  // value to be a pointer to the frame alloc structure.
+  DenseMap<Function *, Value *> EHDataMap;
+  for (CallInst *EHAlloc : HandlerAllocs) {
+    // The EHAlloc has no uses at this time, so we need to just insert the
+    // cast before the next instruction. There is always a next instruction.
+    BasicBlock::iterator II = EHAlloc;
+    ++II;
+    Builder.SetInsertPoint(cast<Instruction>(II));
+    Value *EHData = Builder.CreateBitCast(
+        EHAlloc, EHDataStructTy->getPointerTo(), "eh.data");
+    EHDataMap[EHAlloc->getParent()->getParent()] = EHData;
+  }
+
+  // Next, replace the place-holder EHObjPtr allocas with GEP instructions
+  // that pull the EHObjPtr from the frame alloc structure.
+  for (AllocaInst *EHObjPtr : HandlerEHObjPtrs) {
+    Value *EHData = EHDataMap[EHObjPtr->getParent()->getParent()];
+    Builder.SetInsertPoint(EHObjPtr);
+    Value *ElementPtr = Builder.CreateConstInBoundsGEP2_32(EHData, 0, 1);
+    EHObjPtr->replaceAllUsesWith(ElementPtr);
+    EHObjPtr->removeFromParent();
+    ElementPtr->takeName(EHObjPtr);
+    delete EHObjPtr;
+  }
+
+  // Finally, replace all of the temporary allocas for frame variables used in
+  // the outlined handlers and the original frame allocas with GEP instructions
+  // that get the equivalent pointer from the frame allocation struct.
+  for (auto &VarInfoEntry : FrameVarInfo) {
+    AllocaInst *ParentAlloca = VarInfoEntry.first;
+    HandlerAllocas &AllocaInfo = VarInfoEntry.second;
+    int Idx = AllocaInfo.ParentFrameAllocationIndex;
+
+    // If we have an index of -1 for this instruction, it means it isn't used
+    // outside of this handler.  In that case, we just keep the "temporary"
+    // alloca in the handler and erase the original alloca from the parent.
+    if (Idx == -1) {
+      ParentAlloca->eraseFromParent();
+    } else {
+      // Otherwise, we replace the parent alloca and all outlined allocas
+      // which map to it with GEP instructions.
+
+      // First replace the original alloca.
+      Builder.SetInsertPoint(ParentAlloca);
+      Builder.SetCurrentDebugLocation(ParentAlloca->getDebugLoc());
+      Value *ElementPtr =
+          Builder.CreateConstInBoundsGEP2_32(FrameEHData, 0, Idx);
+      ParentAlloca->replaceAllUsesWith(ElementPtr);
+      ParentAlloca->removeFromParent();
+      ElementPtr->takeName(ParentAlloca);
+      delete ParentAlloca;
+
+      // Next replace all outlined allocas that are mapped to it.
+      for (AllocaInst *TempAlloca : AllocaInfo.Allocas) {
+        Value *EHData = EHDataMap[TempAlloca->getParent()->getParent()];
+        // FIXME: Sink this GEP into the blocks where it is used.
+        Builder.SetInsertPoint(TempAlloca);
+        Builder.SetCurrentDebugLocation(TempAlloca->getDebugLoc());
+        ElementPtr = Builder.CreateConstInBoundsGEP2_32(EHData, 0, Idx);
+        TempAlloca->replaceAllUsesWith(ElementPtr);
+        TempAlloca->removeFromParent();
+        ElementPtr->takeName(TempAlloca);
+        delete TempAlloca;
+      }
+    } // end else of if (Idx == -1)
+  }   // End for each FrameVarInfo entry.
+
+  return HandlersOutlined;
+}
+
+bool WinEHPrepare::outlineCatchHandler(Function *SrcFn, Constant *SelectorType,
+                                       LandingPadInst *LPad, CallInst *&EHAlloc,
+                                       AllocaInst *&EHObjPtr,
+                                       FrameVarInfoMap &VarInfo) {
+  Module *M = SrcFn->getParent();
+  LLVMContext &Context = M->getContext();
+
+  // Create a new function to receive the handler contents.
+  Type *Int8PtrType = Type::getInt8PtrTy(Context);
+  std::vector<Type *> ArgTys;
+  ArgTys.push_back(Int8PtrType);
+  ArgTys.push_back(Int8PtrType);
+  FunctionType *FnType = FunctionType::get(Int8PtrType, ArgTys, false);
+  Function *CatchHandler = Function::Create(
+      FnType, GlobalVariable::ExternalLinkage, SrcFn->getName() + ".catch", M);
+
+  // Generate a standard prolog to setup the frame recovery structure.
+  IRBuilder<> Builder(Context);
+  BasicBlock *Entry = BasicBlock::Create(Context, "catch.entry");
+  CatchHandler->getBasicBlockList().push_front(Entry);
+  Builder.SetInsertPoint(Entry);
+  Builder.SetCurrentDebugLocation(LPad->getDebugLoc());
+
+  // The outlined handler will be called with the parent's frame pointer as
+  // its second argument. To enable the handler to access variables from
+  // the parent frame, we use that pointer to locate a special block of
+  // memory that was allocated using llvm.frameallocate for this purpose.
+  // During the outlining process we will determine which frame variables
+  // are used in handlers and create a structure that maps these variables
+  // into the frame allocation block.
+  //
+  // The frame allocation block also contains an exception state variable
+  // used by the runtime and a pointer to the exception object pointer
+  // which will be filled in by the runtime for use in the handler.
+  Function *RecoverFrameFn =
+      Intrinsic::getDeclaration(M, Intrinsic::framerecover);
+  Value *RecoverArgs[] = {Builder.CreateBitCast(SrcFn, Int8PtrType, ""),
+                          &(CatchHandler->getArgumentList().back())};
+  EHAlloc = Builder.CreateCall(RecoverFrameFn, RecoverArgs, "eh.alloc");
+
+  // This alloca is only temporary.  We'll be replacing it once we know all the
+  // frame variables that need to go in the frame allocation structure.
+  EHObjPtr = Builder.CreateAlloca(Int8PtrType, 0, "eh.obj.ptr");
+
+  // This will give us a raw pointer to the exception object, which
+  // corresponds to the formal parameter of the catch statement.  If the
+  // handler uses this object, we will generate code during the outlining
+  // process to cast the pointer to the appropriate type and dereference it
+  // as necessary.  The un-outlined landing pad code represents the
+  // exception object as the result of the llvm.eh.begincatch call.
+  Value *EHObj = Builder.CreateLoad(EHObjPtr, false, "eh.obj");
+
+  ValueToValueMapTy VMap;
+
+  // FIXME: Map other values referenced in the filter handler.
+
+  WinEHCatchDirector Director(LPad, CatchHandler, SelectorType, EHObj, VarInfo);
+
+  SmallVector<ReturnInst *, 8> Returns;
+  ClonedCodeInfo InlinedFunctionInfo;
+
+  BasicBlock::iterator II = LPad;
+
+  CloneAndPruneIntoFromInst(CatchHandler, SrcFn, ++II, VMap,
+                            /*ModuleLevelChanges=*/false, Returns, "",
+                            &InlinedFunctionInfo,
+                            SrcFn->getParent()->getDataLayout(), &Director);
+
+  // Move all the instructions in the first cloned block into our entry block.
+  BasicBlock *FirstClonedBB = std::next(Function::iterator(Entry));
+  Entry->getInstList().splice(Entry->end(), FirstClonedBB->getInstList());
+  FirstClonedBB->eraseFromParent();
+
+  return true;
+}
+
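+// Decides how each instruction from the parent function is cloned into the
+// outlined catch handler: extracts of the landing pad value, stores and
+// loads of those values, and EH intrinsics are remapped or skipped;
+// everything else is cloned as-is.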
+CloningDirector::CloningAction WinEHCatchDirector::handleInstruction(
+    ValueToValueMapTy &VMap, const Instruction *Inst, BasicBlock *NewBB) {
+  // Intercept instructions which extract values from the landing pad aggregate.
+  if (auto *Extract = dyn_cast<ExtractValueInst>(Inst)) {
+    if (Extract->getAggregateOperand() == LPI) {
+      assert(Extract->getNumIndices() == 1 &&
+             "Unexpected operation: extracting both landing pad values");
+      assert((*(Extract->idx_begin()) == 0 || *(Extract->idx_begin()) == 1) &&
+             "Unexpected operation: extracting an unknown landing pad element");
+
+      if (*(Extract->idx_begin()) == 0) {
+        // Element 0 doesn't directly correspond to anything in the WinEH
+        // scheme.  It will be stored to a memory location, then later loaded,
+        // and finally the loaded value will be used as the argument to an
+        // llvm.eh.begincatch call.  We're tracking it here so that we can
+        // skip the store and load.
+        ExtractedEHPtr = Inst;
+      } else {
+        // Element 1 corresponds to the filter selector.  We'll map it to 1 for
+        // matching purposes, but it will also probably be stored to memory and
+        // reloaded, so we need to track the instruction so that we can map the
+        // loaded value too.
+        VMap[Inst] = ConstantInt::get(SelectorIDType, 1);
+        ExtractedSelector = Inst;
+      }
+
+      // Tell the caller not to clone this instruction.
+      return CloningDirector::SkipInstruction;
+    }
+    // Other extract value instructions just get cloned.
+    return CloningDirector::CloneInstruction;
+  }
+
+  if (auto *Store = dyn_cast<StoreInst>(Inst)) {
+    // Look for and suppress stores of the extracted landingpad values.
+    const Value *StoredValue = Store->getValueOperand();
+    if (StoredValue == ExtractedEHPtr) {
+      EHPtrStoreAddr = Store->getPointerOperand();
+      return CloningDirector::SkipInstruction;
+    }
+    if (StoredValue == ExtractedSelector) {
+      SelectorStoreAddr = Store->getPointerOperand();
+      return CloningDirector::SkipInstruction;
+    }
+
+    // Any other store just gets cloned.
+    return CloningDirector::CloneInstruction;
+  }
+
+  if (auto *Load = dyn_cast<LoadInst>(Inst)) {
+    // Look for loads of (previously suppressed) landingpad values.
+    // The EHPtr load can be ignored (it should only be used as
+    // an argument to llvm.eh.begincatch), but the selector value
+    // needs to be mapped to a constant value of 1 to be used to
+    // simplify the branching to always flow to the current handler.
+    const Value *LoadAddr = Load->getPointerOperand();
+    if (LoadAddr == EHPtrStoreAddr) {
+      VMap[Inst] = UndefValue::get(Int8PtrType);
+      return CloningDirector::SkipInstruction;
+    }
+    if (LoadAddr == SelectorStoreAddr) {
+      VMap[Inst] = ConstantInt::get(SelectorIDType, 1);
+      return CloningDirector::SkipInstruction;
+    }
+
+    // Any other loads just get cloned.
+    return CloningDirector::CloneInstruction;
+  }
+
+  if (match(Inst, m_Intrinsic<Intrinsic::eh_begincatch>())) {
+    // The argument to the call is some form of the first element of the
+    // landingpad aggregate value, but that doesn't matter.  It isn't used
+    // here.
+    // The return value of this instruction, however, is used to access the
+    // EH object pointer.  We have generated an instruction to get that value
+    // from the EH alloc block, so we can just map to that here.
+    VMap[Inst] = EHObj;
+    return CloningDirector::SkipInstruction;
+  }
+  if (match(Inst, m_Intrinsic<Intrinsic::eh_endcatch>())) {
+    auto *IntrinCall = dyn_cast<IntrinsicInst>(Inst);
+    // It might be interesting to track whether or not we are inside a catch
+    // function, but that might make the algorithm more brittle than it needs
+    // to be.
+
+    // The end catch call can occur in one of two places: either in a
+    // landingpad block that is part of the catch handler's exception
+    // mechanism, or at the end of the catch block.  If it occurs in a
+    // landing pad, we must skip it and continue so that the landing pad
+    // gets cloned.
+    // FIXME: This case isn't fully supported yet and shouldn't turn up in any
+    //        of the test cases until it is.
+    if (IntrinCall->getParent()->isLandingPad())
+      return CloningDirector::SkipInstruction;
+
+    // If an end catch occurs anywhere else, the next instruction should be an
+    // unconditional branch instruction that we want to replace with a return
+    // to the address of the branch target.
+    const BasicBlock *EndCatchBB = IntrinCall->getParent();
+    const TerminatorInst *Terminator = EndCatchBB->getTerminator();
+    const BranchInst *Branch = dyn_cast<BranchInst>(Terminator);
+    assert(Branch && Branch->isUnconditional());
+    assert(std::next(BasicBlock::const_iterator(IntrinCall)) ==
+           BasicBlock::const_iterator(Branch));
+
+    ReturnInst::Create(NewBB->getContext(),
+                       BlockAddress::get(Branch->getSuccessor(0)), NewBB);
+
+    // We just added a terminator to the cloned block.
+    // Tell the caller to stop processing the current basic block so that
+    // the branch instruction will be skipped.
+    return CloningDirector::StopCloningBB;
+  }
+  if (match(Inst, m_Intrinsic<Intrinsic::eh_typeid_for>())) {
+    auto *IntrinCall = dyn_cast<IntrinsicInst>(Inst);
+    Value *Selector = IntrinCall->getArgOperand(0)->stripPointerCasts();
+    // This causes a replacement that will collapse the landing pad CFG based
+    // on the filter function we intend to match.
+    if (Selector == CurrentSelector)
+      VMap[Inst] = ConstantInt::get(SelectorIDType, 1);
+    else
+      VMap[Inst] = ConstantInt::get(SelectorIDType, 0);
+    // Tell the caller not to clone this instruction.
+    return CloningDirector::SkipInstruction;
+  }
+
+  // Continue with the default cloning behavior.
+  return CloningDirector::CloneInstruction;
+}
+
+WinEHFrameVariableMaterializer::WinEHFrameVariableMaterializer(
+    Function *OutlinedFn, FrameVarInfoMap &FrameVarInfo)
+    : FrameVarInfo(FrameVarInfo), Builder(OutlinedFn->getContext()) {
+  Builder.SetInsertPoint(&OutlinedFn->getEntryBlock());
+  // FIXME: Do something with the FrameVarInfo map so that it is shared
+  //        across the function.
+}
+
+Value *WinEHFrameVariableMaterializer::materializeValueFor(Value *V) {
+  // If we're asked to materialize an alloca variable, we temporarily
+  // create a matching alloca in the outlined function.  When all the
+  // outlining is complete, we'll collect these into a structure and
+  // replace these temporary allocas with GEPs referencing the frame
+  // allocation block.
+  if (auto *AV = dyn_cast<AllocaInst>(V)) {
+    AllocaInst *NewAlloca = Builder.CreateAlloca(
+        AV->getAllocatedType(), AV->getArraySize(), AV->getName());
+    FrameVarInfo[AV].Allocas.push_back(NewAlloca);
+    return NewAlloca;
+  }
+
+  // FIXME: Do PHI nodes need special handling?
+
+  // FIXME: Are there other cases we can handle better?  GEP, ExtractValue, etc.
+
+  // FIXME: This doesn't work during cloning because it finds an instruction
+  //        in the use list that isn't yet part of a basic block.
+#if 0
+  // If we're asked to remap some other instruction, we'll need to
+  // spill it to an alloca variable in the parent function and add a
+  // temporary alloca in the outlined function to be processed as
+  // described above.
+  Instruction *Inst = dyn_cast<Instruction>(V);
+  if (Inst) {
+    AllocaInst *Spill = DemoteRegToStack(*Inst, true);
+    AllocaInst *NewAlloca = Builder.CreateAlloca(Spill->getAllocatedType(),
+                                                 Spill->getArraySize());
+    FrameVarInfo[Spill].Allocas.push_back(NewAlloca);
+    return NewAlloca;
+  }
+#endif
+
+  return nullptr;
+}