Diffstat (limited to 'lib/CodeGen')
-rw-r--r--  lib/CodeGen/AggressiveAntiDepBreaker.cpp | 16
-rw-r--r--  lib/CodeGen/Analysis.cpp | 12
-rw-r--r--  lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 255
-rw-r--r--  lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp | 3
-rw-r--r--  lib/CodeGen/AsmPrinter/ByteStreamer.h | 12
-rw-r--r--  lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 116
-rw-r--r--  lib/CodeGen/AsmPrinter/CodeViewDebug.h | 3
-rw-r--r--  lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp | 12
-rw-r--r--  lib/CodeGen/AsmPrinter/DebugLocStream.h | 19
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 176
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfCompileUnit.h | 31
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 644
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfDebug.h | 22
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfExpression.cpp | 95
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfExpression.h | 95
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfFile.h | 19
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 47
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfUnit.h | 14
-rw-r--r--  lib/CodeGen/AsmPrinter/EHStreamer.cpp | 6
-rw-r--r--  lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp | 2
-rw-r--r--  lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp | 4
-rw-r--r--  lib/CodeGen/AsmPrinter/WinException.cpp | 3
-rw-r--r--  lib/CodeGen/AtomicExpandPass.cpp | 12
-rw-r--r--  lib/CodeGen/BranchFolding.cpp | 34
-rw-r--r--  lib/CodeGen/BranchRelaxation.cpp | 22
-rw-r--r--  lib/CodeGen/BreakFalseDeps.cpp | 23
-rw-r--r--  lib/CodeGen/CalcSpillWeights.cpp | 22
-rw-r--r--  lib/CodeGen/CallingConvLower.cpp | 42
-rw-r--r--  lib/CodeGen/CodeGen.cpp | 5
-rw-r--r--  lib/CodeGen/CodeGenPrepare.cpp | 122
-rw-r--r--  lib/CodeGen/CriticalAntiDepBreaker.cpp | 9
-rw-r--r--  lib/CodeGen/DFAPacketizer.cpp | 81
-rw-r--r--  lib/CodeGen/DeadMachineInstructionElim.cpp | 12
-rw-r--r--  lib/CodeGen/DetectDeadLanes.cpp | 56
-rw-r--r--  lib/CodeGen/EarlyIfConversion.cpp | 345
-rw-r--r--  lib/CodeGen/ExecutionDomainFix.cpp | 1
-rw-r--r--  lib/CodeGen/ExpandMemCmp.cpp | 2
-rw-r--r--  lib/CodeGen/ExpandPostRAPseudos.cpp | 10
-rw-r--r--  lib/CodeGen/GCMetadata.cpp | 2
-rw-r--r--  lib/CodeGen/GCRootLowering.cpp | 4
-rw-r--r--  lib/CodeGen/GlobalISel/CSEInfo.cpp | 7
-rw-r--r--  lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp | 11
-rw-r--r--  lib/CodeGen/GlobalISel/CallLowering.cpp | 288
-rw-r--r--  lib/CodeGen/GlobalISel/Combiner.cpp | 14
-rw-r--r--  lib/CodeGen/GlobalISel/CombinerHelper.cpp | 919
-rw-r--r--  lib/CodeGen/GlobalISel/GISelKnownBits.cpp | 383
-rw-r--r--  lib/CodeGen/GlobalISel/IRTranslator.cpp | 392
-rw-r--r--  lib/CodeGen/GlobalISel/InstructionSelect.cpp | 38
-rw-r--r--  lib/CodeGen/GlobalISel/InstructionSelector.cpp | 2
-rw-r--r--  lib/CodeGen/GlobalISel/Legalizer.cpp | 35
-rw-r--r--  lib/CodeGen/GlobalISel/LegalizerHelper.cpp | 978
-rw-r--r--  lib/CodeGen/GlobalISel/LegalizerInfo.cpp | 42
-rw-r--r--  lib/CodeGen/GlobalISel/Localizer.cpp | 11
-rw-r--r--  lib/CodeGen/GlobalISel/MachineIRBuilder.cpp | 93
-rw-r--r--  lib/CodeGen/GlobalISel/RegBankSelect.cpp | 13
-rw-r--r--  lib/CodeGen/GlobalISel/RegisterBank.cpp | 1
-rw-r--r--  lib/CodeGen/GlobalISel/RegisterBankInfo.cpp | 17
-rw-r--r--  lib/CodeGen/GlobalISel/Utils.cpp | 98
-rw-r--r--  lib/CodeGen/GlobalMerge.cpp | 8
-rw-r--r--  lib/CodeGen/HardwareLoops.cpp | 2
-rw-r--r--  lib/CodeGen/IfConversion.cpp | 200
-rw-r--r--  lib/CodeGen/ImplicitNullChecks.cpp | 8
-rw-r--r--  lib/CodeGen/InlineSpiller.cpp | 22
-rw-r--r--  lib/CodeGen/InterleavedLoadCombinePass.cpp | 4
-rw-r--r--  lib/CodeGen/LLVMTargetMachine.cpp | 34
-rw-r--r--  lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp | 6
-rw-r--r--  lib/CodeGen/LexicalScopes.cpp | 1
-rw-r--r--  lib/CodeGen/LiveDebugValues.cpp | 510
-rw-r--r--  lib/CodeGen/LiveDebugVariables.cpp | 257
-rw-r--r--  lib/CodeGen/LiveInterval.cpp | 7
-rw-r--r--  lib/CodeGen/LiveIntervals.cpp | 59
-rw-r--r--  lib/CodeGen/LivePhysRegs.cpp | 20
-rw-r--r--  lib/CodeGen/LiveRangeCalc.cpp | 5
-rw-r--r--  lib/CodeGen/LiveRangeCalc.h | 297
-rw-r--r--  lib/CodeGen/LiveRangeEdit.cpp | 14
-rw-r--r--  lib/CodeGen/LiveRangeShrink.cpp | 4
-rw-r--r--  lib/CodeGen/LiveRegMatrix.cpp | 2
-rw-r--r--  lib/CodeGen/LiveRegUnits.cpp | 12
-rw-r--r--  lib/CodeGen/LiveStacks.cpp | 7
-rw-r--r--  lib/CodeGen/LiveVariables.cpp | 29
-rw-r--r--  lib/CodeGen/LocalStackSlotAllocation.cpp | 10
-rw-r--r--  lib/CodeGen/LowerEmuTLS.cpp | 7
-rw-r--r--  lib/CodeGen/MIRCanonicalizerPass.cpp | 359
-rw-r--r--  lib/CodeGen/MIRNamerPass.cpp | 77
-rw-r--r--  lib/CodeGen/MIRParser/MILexer.cpp | 1
-rw-r--r--  lib/CodeGen/MIRParser/MILexer.h | 2
-rw-r--r--  lib/CodeGen/MIRParser/MIParser.cpp | 60
-rw-r--r--  lib/CodeGen/MIRParser/MIRParser.cpp | 18
-rw-r--r--  lib/CodeGen/MIRPrinter.cpp | 16
-rw-r--r--  lib/CodeGen/MIRVRegNamerUtils.cpp | 348
-rw-r--r--  lib/CodeGen/MIRVRegNamerUtils.h | 91
-rw-r--r--  lib/CodeGen/MachineBasicBlock.cpp | 64
-rw-r--r--  lib/CodeGen/MachineBlockPlacement.cpp | 28
-rw-r--r--  lib/CodeGen/MachineCSE.cpp | 75
-rw-r--r--  lib/CodeGen/MachineCombiner.cpp | 6
-rw-r--r--  lib/CodeGen/MachineCopyPropagation.cpp | 78
-rw-r--r--  lib/CodeGen/MachineDominators.cpp | 23
-rw-r--r--  lib/CodeGen/MachineFrameInfo.cpp | 38
-rw-r--r--  lib/CodeGen/MachineFunction.cpp | 58
-rw-r--r--  lib/CodeGen/MachineFunctionPass.cpp | 6
-rw-r--r--  lib/CodeGen/MachineInstr.cpp | 116
-rw-r--r--  lib/CodeGen/MachineInstrBundle.cpp | 14
-rw-r--r--  lib/CodeGen/MachineLICM.cpp | 61
-rw-r--r--  lib/CodeGen/MachineLoopUtils.cpp | 132
-rw-r--r--  lib/CodeGen/MachineModuleInfo.cpp | 85
-rw-r--r--  lib/CodeGen/MachineOperand.cpp | 70
-rw-r--r--  lib/CodeGen/MachineOptimizationRemarkEmitter.cpp | 2
-rw-r--r--  lib/CodeGen/MachineOutliner.cpp | 16
-rw-r--r--  lib/CodeGen/MachinePipeliner.cpp | 1235
-rw-r--r--  lib/CodeGen/MachinePostDominators.cpp | 55
-rw-r--r--  lib/CodeGen/MachineRegisterInfo.cpp | 12
-rw-r--r--  lib/CodeGen/MachineSSAUpdater.cpp | 6
-rw-r--r--  lib/CodeGen/MachineScheduler.cpp | 59
-rw-r--r--  lib/CodeGen/MachineSink.cpp | 73
-rw-r--r--  lib/CodeGen/MachineTraceMetrics.cpp | 24
-rw-r--r--  lib/CodeGen/MachineVerifier.cpp | 163
-rw-r--r--  lib/CodeGen/MacroFusion.cpp | 4
-rw-r--r--  lib/CodeGen/ModuloSchedule.cpp | 2022
-rw-r--r--  lib/CodeGen/OptimizePHIs.cpp | 15
-rw-r--r--  lib/CodeGen/PHIElimination.cpp | 43
-rw-r--r--  lib/CodeGen/PatchableFunction.cpp | 2
-rw-r--r--  lib/CodeGen/PeepholeOptimizer.cpp | 83
-rw-r--r--  lib/CodeGen/PreISelIntrinsicLowering.cpp | 2
-rw-r--r--  lib/CodeGen/ProcessImplicitDefs.cpp | 8
-rw-r--r--  lib/CodeGen/PrologEpilogInserter.cpp | 2
-rw-r--r--  lib/CodeGen/PseudoSourceValue.cpp | 6
-rw-r--r--  lib/CodeGen/ReachingDefAnalysis.cpp | 1
-rw-r--r--  lib/CodeGen/RegAllocBase.cpp | 4
-rw-r--r--  lib/CodeGen/RegAllocFast.cpp | 117
-rw-r--r--  lib/CodeGen/RegAllocGreedy.cpp | 16
-rw-r--r--  lib/CodeGen/RegAllocPBQP.cpp | 12
-rw-r--r--  lib/CodeGen/RegUsageInfoCollector.cpp | 10
-rw-r--r--  lib/CodeGen/RegUsageInfoPropagate.cpp | 6
-rw-r--r--  lib/CodeGen/RegisterCoalescer.cpp | 71
-rw-r--r--  lib/CodeGen/RegisterPressure.cpp | 36
-rw-r--r--  lib/CodeGen/RegisterScavenging.cpp | 62
-rw-r--r--  lib/CodeGen/RenameIndependentSubregs.cpp | 4
-rw-r--r--  lib/CodeGen/SafeStack.cpp | 2
-rw-r--r--  lib/CodeGen/ScalarizeMaskedMemIntrin.cpp | 167
-rw-r--r--  lib/CodeGen/ScheduleDAGInstrs.cpp | 57
-rw-r--r--  lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 1758
-rw-r--r--  lib/CodeGen/SelectionDAG/FastISel.cpp | 67
-rw-r--r--  lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp | 7
-rw-r--r--  lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 77
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 222
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp | 430
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 510
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeTypes.cpp | 56
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeTypes.h | 61
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp | 46
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp | 50
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 139
-rw-r--r--  lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp | 2
-rw-r--r--  lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp | 18
-rw-r--r--  lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp | 18
-rw-r--r--  lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h | 3
-rw-r--r--  lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp | 9
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 283
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp | 1
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 495
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h | 2
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 9
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 35
-rw-r--r--  lib/CodeGen/SelectionDAG/StatepointLowering.cpp | 34
-rw-r--r--  lib/CodeGen/SelectionDAG/TargetLowering.cpp | 1406
-rw-r--r--  lib/CodeGen/ShrinkWrap.cpp | 5
-rw-r--r--  lib/CodeGen/SjLjEHPrepare.cpp | 5
-rw-r--r--  lib/CodeGen/SplitKit.cpp | 6
-rw-r--r--  lib/CodeGen/SplitKit.h | 2
-rw-r--r--  lib/CodeGen/StackMaps.cpp | 8
-rw-r--r--  lib/CodeGen/StackProtector.cpp | 67
-rw-r--r--  lib/CodeGen/StackSlotColoring.cpp | 8
-rw-r--r--  lib/CodeGen/SwiftErrorValueTracking.cpp | 3
-rw-r--r--  lib/CodeGen/TailDuplicator.cpp | 22
-rw-r--r--  lib/CodeGen/TargetFrameLoweringImpl.cpp | 19
-rw-r--r--  lib/CodeGen/TargetInstrInfo.cpp | 82
-rw-r--r--  lib/CodeGen/TargetLoweringBase.cpp | 95
-rw-r--r--  lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 107
-rw-r--r--  lib/CodeGen/TargetPassConfig.cpp | 24
-rw-r--r--  lib/CodeGen/TargetRegisterInfo.cpp | 60
-rw-r--r--  lib/CodeGen/TargetSchedule.cpp | 2
-rw-r--r--  lib/CodeGen/TwoAddressInstructionPass.cpp | 90
-rw-r--r--  lib/CodeGen/UnreachableBlockElim.cpp | 15
-rw-r--r--  lib/CodeGen/ValueTypes.cpp | 150
-rw-r--r--  lib/CodeGen/VirtRegMap.cpp | 71
-rw-r--r--  lib/CodeGen/XRayInstrumentation.cpp | 2
186 files changed, 13421 insertions, 6660 deletions
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index 444f618d8b8c..f64b775a8b77 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -232,7 +232,7 @@ bool AggressiveAntiDepBreaker::IsImplicitDefUse(MachineInstr &MI,
if (!MO.isReg() || !MO.isImplicit())
return false;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == 0)
return false;
@@ -252,7 +252,7 @@ void AggressiveAntiDepBreaker::GetPassthruRegs(
if (!MO.isReg()) continue;
if ((MO.isDef() && MI.isRegTiedToUseOperand(i)) ||
IsImplicitDefUse(MI, MO)) {
- const unsigned Reg = MO.getReg();
+ const Register Reg = MO.getReg();
for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
SubRegs.isValid(); ++SubRegs)
PassthruRegs.insert(*SubRegs);
@@ -365,7 +365,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction(
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || !MO.isDef()) continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == 0) continue;
HandleLastUse(Reg, Count + 1, "", "\tDead Def: ", "\n");
@@ -375,7 +375,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction(
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || !MO.isDef()) continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == 0) continue;
LLVM_DEBUG(dbgs() << " " << printReg(Reg, TRI) << "=g"
@@ -418,7 +418,7 @@ void AggressiveAntiDepBreaker::PrescanInstruction(
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || !MO.isDef()) continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == 0) continue;
// Ignore KILLs and passthru registers for liveness...
if (MI.isKill() || (PassthruRegs.count(Reg) != 0))
@@ -471,7 +471,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr &MI,
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || !MO.isUse()) continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == 0) continue;
LLVM_DEBUG(dbgs() << " " << printReg(Reg, TRI) << "=g"
@@ -506,7 +506,7 @@ void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr &MI,
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg()) continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == 0) continue;
if (FirstReg != 0) {
@@ -790,7 +790,7 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
CriticalPathSU = SU;
}
}
-
+ assert(CriticalPathSU && "Failed to find SUnit critical path");
CriticalPathMI = CriticalPathSU->getInstr();
}
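
The `unsigned Reg` to `Register Reg` substitution above recurs through nearly every file in this patch. The sketch below is a self-contained stand-in for the idea behind llvm::Register (not the real llvm/CodeGen/Register.h): a thin wrapper over the raw register number that names the virtual-vs-physical encoding while still converting implicitly to unsigned, which is what makes the substitution source-compatible.

#include <cassert>

// Stand-in for llvm::Register: wraps the raw unsigned register number but
// makes the encoding explicit. Virtual registers have the high bit set
// (the real class also carves out a stack-slot range, ignored here).
class Register {
  unsigned Reg = 0;

public:
  constexpr Register(unsigned R = 0) : Reg(R) {}
  constexpr operator unsigned() const { return Reg; } // drop-in for old code

  static constexpr bool isVirtualRegister(unsigned R) {
    return (R & (1u << 31)) != 0;
  }
  static constexpr bool isPhysicalRegister(unsigned R) {
    return R != 0 && (R & (1u << 31)) == 0;
  }
};

int main() {
  Register NoReg;                // 0 means "no register"
  Register Phys(42);             // physical registers are small integers
  Register Virt((1u << 31) | 5); // virtual registers: high bit set
  assert(NoReg == 0 && Register::isPhysicalRegister(Phys) &&
         Register::isVirtualRegister(Virt));
}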
diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp
index d158e70b86ac..4f24f077d120 100644
--- a/lib/CodeGen/Analysis.cpp
+++ b/lib/CodeGen/Analysis.cpp
@@ -309,7 +309,8 @@ static const Value *getNoopInput(const Value *V,
NoopInput = Op;
} else if (isa<TruncInst>(I) &&
TLI.allowTruncateForTailCall(Op->getType(), I->getType())) {
- DataBits = std::min(DataBits, I->getType()->getPrimitiveSizeInBits());
+ DataBits = std::min((uint64_t)DataBits,
+ I->getType()->getPrimitiveSizeInBits().getFixedSize());
NoopInput = Op;
} else if (auto CS = ImmutableCallSite(I)) {
const Value *ReturnedOp = CS.getReturnedArgOperand();
@@ -523,7 +524,8 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, const TargetMachine &TM) {
// longjmp on x86), it can end up causing miscompilation that has not
// been fully understood.
if (!Ret &&
- (!TM.Options.GuaranteedTailCallOpt || !isa<UnreachableInst>(Term)))
+ ((!TM.Options.GuaranteedTailCallOpt &&
+ CS.getCallingConv() != CallingConv::Tail) || !isa<UnreachableInst>(Term)))
return false;
// If I will have a chain, make sure no other instruction that will have a
@@ -536,9 +538,11 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, const TargetMachine &TM) {
// Debug info intrinsics do not get in the way of tail call optimization.
if (isa<DbgInfoIntrinsic>(BBI))
continue;
- // A lifetime end intrinsic should not stop tail call optimization.
+ // A lifetime end or assume intrinsic should not stop tail call
+ // optimization.
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(BBI))
- if (II->getIntrinsicID() == Intrinsic::lifetime_end)
+ if (II->getIntrinsicID() == Intrinsic::lifetime_end ||
+ II->getIntrinsicID() == Intrinsic::assume)
continue;
if (BBI->mayHaveSideEffects() || BBI->mayReadFromMemory() ||
!isSafeToSpeculativelyExecute(&*BBI))
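
Two relaxations land in isInTailCallPosition above: a call using the new `tail` calling convention (CallingConv::Tail) followed by `unreachable` now counts as a tail-call position even without -tailcallopt, and `llvm.assume` joins `llvm.lifetime.end` on the list of intrinsics allowed to sit between the call and the return. Below is a standalone sketch of the second check, using stand-in types in place of llvm::Instruction and llvm::IntrinsicInst.

// Stand-in types; the real code walks llvm::Instruction iterators and also
// checks isSafeToSpeculativelyExecute.
enum class IntrinsicID { None, DbgValue, LifetimeEnd, Assume };

struct Inst {
  IntrinsicID ID = IntrinsicID::None;
  bool HasSideEffects = false;
  bool ReadsMemory = false;
};

// Mirrors the loop body above: which instructions may sit between a
// candidate tail call and the return without blocking the optimization?
static bool blocksTailCall(const Inst &I) {
  switch (I.ID) {
  case IntrinsicID::DbgValue:    // debug intrinsics never block TCO
  case IntrinsicID::LifetimeEnd: // already tolerated before this patch
  case IntrinsicID::Assume:      // new: assume is equally side-effect free
    return false;
  default:
    return I.HasSideEffects || I.ReadsMemory;
  }
}

int main() {
  return blocksTailCall({IntrinsicID::Assume}) ? 1 : 0; // exits 0: no block
}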
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 54f6cc2d5571..73c53d6c4af5 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -91,10 +91,12 @@
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSectionXCOFF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCSymbolXCOFF.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/MC/MCValue.h"
#include "llvm/MC/SectionKind.h"
@@ -159,30 +161,30 @@ static gcp_map_type &getGCMap(void *&P) {
return *(gcp_map_type*)P;
}
-/// getGVAlignmentLog2 - Return the alignment to use for the specified global
-/// value in log2 form. This rounds up to the preferred alignment if possible
-/// and legal.
-static unsigned getGVAlignmentLog2(const GlobalValue *GV, const DataLayout &DL,
- unsigned InBits = 0) {
- unsigned NumBits = 0;
+/// getGVAlignment - Return the alignment to use for the specified global
+/// value. This rounds up to the preferred alignment if possible and legal.
+Align AsmPrinter::getGVAlignment(const GlobalValue *GV, const DataLayout &DL,
+ Align InAlign) {
+ Align Alignment;
if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
- NumBits = DL.getPreferredAlignmentLog(GVar);
+ Alignment = Align(DL.getPreferredAlignment(GVar));
- // If InBits is specified, round it to it.
- if (InBits > NumBits)
- NumBits = InBits;
+ // If InAlign is specified, round it to it.
+ if (InAlign > Alignment)
+ Alignment = InAlign;
// If the GV has a specified alignment, take it into account.
- if (GV->getAlignment() == 0)
- return NumBits;
+ const MaybeAlign GVAlign(GV->getAlignment());
+ if (!GVAlign)
+ return Alignment;
- unsigned GVAlign = Log2_32(GV->getAlignment());
+ assert(GVAlign && "GVAlign must be set");
// If the GVAlign is larger than NumBits, or if we are required to obey
// NumBits because the GV has an assigned section, obey it.
- if (GVAlign > NumBits || GV->hasSection())
- NumBits = GVAlign;
- return NumBits;
+ if (*GVAlign > Alignment || GV->hasSection())
+ Alignment = *GVAlign;
+ return Alignment;
}
AsmPrinter::AsmPrinter(TargetMachine &tm, std::unique_ptr<MCStreamer> Streamer)
@@ -248,13 +250,14 @@ const MCSection *AsmPrinter::getCurrentSection() const {
void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
- AU.addRequired<MachineModuleInfo>();
+ AU.addRequired<MachineModuleInfoWrapperPass>();
AU.addRequired<MachineOptimizationRemarkEmitterPass>();
AU.addRequired<GCModuleInfo>();
}
bool AsmPrinter::doInitialization(Module &M) {
- MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+ auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>();
+ MMI = MMIWP ? &MMIWP->getMMI() : nullptr;
// Initialize TargetLoweringObjectFile.
const_cast<TargetLoweringObjectFile&>(getObjFileLowering())
@@ -311,7 +314,7 @@ bool AsmPrinter::doInitialization(Module &M) {
if (MAI->doesSupportDebugInformation()) {
bool EmitCodeView = MMI->getModule()->getCodeViewFlag();
if (EmitCodeView && TM.getTargetTriple().isOSWindows()) {
- Handlers.emplace_back(llvm::make_unique<CodeViewDebug>(this),
+ Handlers.emplace_back(std::make_unique<CodeViewDebug>(this),
DbgTimerName, DbgTimerDescription,
CodeViewLineTablesGroupName,
CodeViewLineTablesGroupDescription);
@@ -380,7 +383,7 @@ bool AsmPrinter::doInitialization(Module &M) {
if (mdconst::extract_or_null<ConstantInt>(
MMI->getModule()->getModuleFlag("cfguardtable")))
- Handlers.emplace_back(llvm::make_unique<WinCFGuard>(this), CFGuardName,
+ Handlers.emplace_back(std::make_unique<WinCFGuard>(this), CFGuardName,
CFGuardDescription, DWARFGroupName,
DWARFGroupDescription);
@@ -425,7 +428,10 @@ void AsmPrinter::EmitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const {
OutStreamer->EmitSymbolAttribute(GVSym, MCSA_Global);
return;
case GlobalValue::PrivateLinkage:
+ return;
case GlobalValue::InternalLinkage:
+ if (MAI->hasDotLGloblDirective())
+ OutStreamer->EmitSymbolAttribute(GVSym, MCSA_LGlobal);
return;
case GlobalValue::AppendingLinkage:
case GlobalValue::AvailableExternallyLinkage:
@@ -501,7 +507,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
// If the alignment is specified, we *must* obey it. Overaligning a global
// with a specified alignment is a prompt way to break globals emitted to
// sections and expected to be contiguous (e.g. ObjC metadata).
- unsigned AlignLog = getGVAlignmentLog2(GV, DL);
+ const Align Alignment = getGVAlignment(GV, DL);
for (const HandlerInfo &HI : Handlers) {
NamedRegionTimer T(HI.TimerName, HI.TimerDescription,
@@ -513,12 +519,11 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
// Handle common symbols
if (GVKind.isCommon()) {
if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it.
- unsigned Align = 1 << AlignLog;
- if (!getObjFileLowering().getCommDirectiveSupportsAlignment())
- Align = 0;
-
// .comm _foo, 42, 4
- OutStreamer->EmitCommonSymbol(GVSym, Size, Align);
+ const bool SupportsAlignment =
+ getObjFileLowering().getCommDirectiveSupportsAlignment();
+ OutStreamer->EmitCommonSymbol(GVSym, Size,
+ SupportsAlignment ? Alignment.value() : 0);
return;
}
@@ -531,10 +536,9 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
TheSection->isVirtualSection()) {
if (Size == 0)
Size = 1; // zerofill of 0 bytes is undefined.
- unsigned Align = 1 << AlignLog;
EmitLinkage(GV, GVSym);
// .zerofill __DATA, __bss, _foo, 400, 5
- OutStreamer->EmitZerofill(TheSection, GVSym, Size, Align);
+ OutStreamer->EmitZerofill(TheSection, GVSym, Size, Alignment.value());
return;
}
@@ -544,7 +548,6 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
getObjFileLowering().getBSSSection() == TheSection) {
if (Size == 0)
Size = 1; // .comm Foo, 0 is undefined, avoid it.
- unsigned Align = 1 << AlignLog;
// Use .lcomm only if it supports user-specified alignment.
// Otherwise, while it would still be correct to use .lcomm in some
@@ -554,17 +557,17 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
// Prefer to simply fall back to .local / .comm in this case.
if (MAI->getLCOMMDirectiveAlignmentType() != LCOMM::NoAlignment) {
// .lcomm _foo, 42
- OutStreamer->EmitLocalCommonSymbol(GVSym, Size, Align);
+ OutStreamer->EmitLocalCommonSymbol(GVSym, Size, Alignment.value());
return;
}
- if (!getObjFileLowering().getCommDirectiveSupportsAlignment())
- Align = 0;
-
// .local _foo
OutStreamer->EmitSymbolAttribute(GVSym, MCSA_Local);
// .comm _foo, 42, 4
- OutStreamer->EmitCommonSymbol(GVSym, Size, Align);
+ const bool SupportsAlignment =
+ getObjFileLowering().getCommDirectiveSupportsAlignment();
+ OutStreamer->EmitCommonSymbol(GVSym, Size,
+ SupportsAlignment ? Alignment.value() : 0);
return;
}
@@ -585,11 +588,11 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
if (GVKind.isThreadBSS()) {
TheSection = getObjFileLowering().getTLSBSSSection();
- OutStreamer->EmitTBSSSymbol(TheSection, MangSym, Size, 1 << AlignLog);
+ OutStreamer->EmitTBSSSymbol(TheSection, MangSym, Size, Alignment.value());
} else if (GVKind.isThreadData()) {
OutStreamer->SwitchSection(TheSection);
- EmitAlignment(AlignLog, GV);
+ EmitAlignment(Alignment, GV);
OutStreamer->EmitLabel(MangSym);
EmitGlobalConstant(GV->getParent()->getDataLayout(),
@@ -625,7 +628,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
OutStreamer->SwitchSection(TheSection);
EmitLinkage(GV, EmittedInitSym);
- EmitAlignment(AlignLog, GV);
+ EmitAlignment(Alignment, GV);
OutStreamer->EmitLabel(EmittedInitSym);
@@ -664,6 +667,10 @@ void AsmPrinter::EmitFunctionHeader() {
OutStreamer->SwitchSection(getObjFileLowering().SectionForGlobal(&F, TM));
EmitVisibility(CurrentFnSym, F.getVisibility());
+ if (MAI->needsFunctionDescriptors() &&
+ F.getLinkage() != GlobalValue::InternalLinkage)
+ EmitLinkage(&F, CurrentFnDescSym);
+
EmitLinkage(&F, CurrentFnSym);
if (MAI->hasFunctionAlignment())
EmitAlignment(MF->getAlignment(), &F);
@@ -699,8 +706,13 @@ void AsmPrinter::EmitFunctionHeader() {
}
}
- // Emit the CurrentFnSym. This is a virtual function to allow targets to
- // do their wild and crazy things as required.
+ // Emit the function descriptor. This is a virtual function to allow targets
+ // to emit their specific function descriptor.
+ if (MAI->needsFunctionDescriptors())
+ EmitFunctionDescriptor();
+
+ // Emit the CurrentFnSym. This is a virtual function to allow targets to do
+ // their wild and crazy things as required.
EmitFunctionEntryLabel();
// If the function had address-taken blocks that got deleted, then we have
@@ -783,7 +795,7 @@ static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
/// emitImplicitDef - This method emits the specified machine instruction
/// that is an implicit def.
void AsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
- unsigned RegNo = MI->getOperand(0).getReg();
+ Register RegNo = MI->getOperand(0).getReg();
SmallString<128> Str;
raw_svector_ostream OS(Str);
@@ -910,7 +922,8 @@ static bool emitDebugLabelComment(const MachineInstr *MI, AsmPrinter &AP) {
OS << "DEBUG_LABEL: ";
const DILabel *V = MI->getDebugLabel();
- if (auto *SP = dyn_cast<DISubprogram>(V->getScope())) {
+ if (auto *SP = dyn_cast<DISubprogram>(
+ V->getScope()->getNonLexicalBlockFileScope())) {
StringRef Name = SP->getName();
if (!Name.empty())
OS << Name << ":";
@@ -1024,7 +1037,7 @@ void AsmPrinter::EmitFunctionBody() {
// Get MachineDominatorTree or compute it on the fly if it's unavailable
MDT = getAnalysisIfAvailable<MachineDominatorTree>();
if (!MDT) {
- OwnedMDT = make_unique<MachineDominatorTree>();
+ OwnedMDT = std::make_unique<MachineDominatorTree>();
OwnedMDT->getBase().recalculate(*MF);
MDT = OwnedMDT.get();
}
@@ -1032,7 +1045,7 @@ void AsmPrinter::EmitFunctionBody() {
// Get MachineLoopInfo or compute it on the fly if it's unavailable
MLI = getAnalysisIfAvailable<MachineLoopInfo>();
if (!MLI) {
- OwnedMLI = make_unique<MachineLoopInfo>();
+ OwnedMLI = std::make_unique<MachineLoopInfo>();
OwnedMLI->getBase().analyze(MDT->getBase());
MLI = OwnedMLI.get();
}
@@ -1052,9 +1065,13 @@ void AsmPrinter::EmitFunctionBody() {
++NumInstsInFunction;
}
- // If there is a pre-instruction symbol, emit a label for it here.
+ // If there is a pre-instruction symbol, emit a label for it here. If the
+ // instruction was duplicated and the label has already been emitted,
+ // don't re-emit the same label.
+ // FIXME: Consider strengthening that to an assertion.
if (MCSymbol *S = MI.getPreInstrSymbol())
- OutStreamer->EmitLabel(S);
+ if (S->isUndefined())
+ OutStreamer->EmitLabel(S);
if (ShouldPrintDebugScopes) {
for (const HandlerInfo &HI : Handlers) {
@@ -1107,9 +1124,13 @@ void AsmPrinter::EmitFunctionBody() {
break;
}
- // If there is a post-instruction symbol, emit a label for it here.
+ // If there is a post-instruction symbol, emit a label for it here. If
+ // the instruction was duplicated and the label has already been emitted,
+ // don't re-emit the same label.
+ // FIXME: Consider strengthening that to an assertion.
if (MCSymbol *S = MI.getPostInstrSymbol())
- OutStreamer->EmitLabel(S);
+ if (S->isUndefined())
+ OutStreamer->EmitLabel(S);
if (ShouldPrintDebugScopes) {
for (const HandlerInfo &HI : Handlers) {
@@ -1313,11 +1334,10 @@ void AsmPrinter::emitGlobalIndirectSymbol(Module &M,
// Set the symbol type to function if the alias has a function type.
// This affects codegen when the aliasee is not a function.
- if (IsFunction) {
- OutStreamer->EmitSymbolAttribute(Name, MCSA_ELF_TypeFunction);
- if (isa<GlobalIFunc>(GIS))
- OutStreamer->EmitSymbolAttribute(Name, MCSA_ELF_TypeIndFunction);
- }
+ if (IsFunction)
+ OutStreamer->EmitSymbolAttribute(Name, isa<GlobalIFunc>(GIS)
+ ? MCSA_ELF_TypeIndFunction
+ : MCSA_ELF_TypeFunction);
EmitVisibility(Name, GIS.getVisibility());
@@ -1349,60 +1369,28 @@ void AsmPrinter::emitRemarksSection(Module &M) {
RemarkStreamer *RS = M.getContext().getRemarkStreamer();
if (!RS)
return;
- const remarks::Serializer &Serializer = RS->getSerializer();
+ remarks::RemarkSerializer &RemarkSerializer = RS->getSerializer();
+
+ Optional<SmallString<128>> Filename;
+ if (Optional<StringRef> FilenameRef = RS->getFilename()) {
+ Filename = *FilenameRef;
+ sys::fs::make_absolute(*Filename);
+ assert(!Filename->empty() && "The filename can't be empty.");
+ }
+
+ std::string Buf;
+ raw_string_ostream OS(Buf);
+ std::unique_ptr<remarks::MetaSerializer> MetaSerializer =
+ Filename ? RemarkSerializer.metaSerializer(OS, StringRef(*Filename))
+ : RemarkSerializer.metaSerializer(OS);
+ MetaSerializer->emit();
// Switch to the right section: .remarks/__remarks.
MCSection *RemarksSection =
OutContext.getObjectFileInfo()->getRemarksSection();
OutStreamer->SwitchSection(RemarksSection);
- // Emit the magic number.
- OutStreamer->EmitBytes(remarks::Magic);
- // Explicitly emit a '\0'.
- OutStreamer->EmitIntValue(/*Value=*/0, /*Size=*/1);
-
- // Emit the version number: little-endian uint64_t.
- // The version number is located at the offset 0x0 in the section.
- std::array<char, 8> Version;
- support::endian::write64le(Version.data(), remarks::Version);
- OutStreamer->EmitBinaryData(StringRef(Version.data(), Version.size()));
-
- // Emit the string table in the section.
- // Note: we need to use the streamer here to emit it in the section. We can't
- // just use the serialize function with a raw_ostream because of the way
- // MCStreamers work.
- uint64_t StrTabSize =
- Serializer.StrTab ? Serializer.StrTab->SerializedSize : 0;
- // Emit the total size of the string table (the size itself excluded):
- // little-endian uint64_t.
- // The total size is located after the version number.
- // Note: even if no string table is used, emit 0.
- std::array<char, 8> StrTabSizeBuf;
- support::endian::write64le(StrTabSizeBuf.data(), StrTabSize);
- OutStreamer->EmitBinaryData(
- StringRef(StrTabSizeBuf.data(), StrTabSizeBuf.size()));
-
- if (const Optional<remarks::StringTable> &StrTab = Serializer.StrTab) {
- std::vector<StringRef> StrTabStrings = StrTab->serialize();
- // Emit a list of null-terminated strings.
- // Note: the order is important here: the ID used in the remarks corresponds
- // to the position of the string in the section.
- for (StringRef Str : StrTabStrings) {
- OutStreamer->EmitBytes(Str);
- // Explicitly emit a '\0'.
- OutStreamer->EmitIntValue(/*Value=*/0, /*Size=*/1);
- }
- }
-
- // Emit the null-terminated absolute path to the remark file.
- // The path is located at the offset 0x4 in the section.
- StringRef FilenameRef = RS->getFilename();
- SmallString<128> Filename = FilenameRef;
- sys::fs::make_absolute(Filename);
- assert(!Filename.empty() && "The filename can't be empty.");
- OutStreamer->EmitBytes(Filename);
- // Explicitly emit a '\0'.
- OutStreamer->EmitIntValue(/*Value=*/0, /*Size=*/1);
+ OutStreamer->EmitBinaryData(OS.str());
}
bool AsmPrinter::doFinalization(Module &M) {
@@ -1455,7 +1443,7 @@ bool AsmPrinter::doFinalization(Module &M) {
OutStreamer->SwitchSection(TLOF.getDataSection());
const DataLayout &DL = M.getDataLayout();
- EmitAlignment(Log2_32(DL.getPointerSize()));
+ EmitAlignment(Align(DL.getPointerSize()));
for (const auto &Stub : Stubs) {
OutStreamer->EmitLabel(Stub.first);
OutStreamer->EmitSymbolValue(Stub.second.getPointer(),
@@ -1482,7 +1470,7 @@ bool AsmPrinter::doFinalization(Module &M) {
COFF::IMAGE_SCN_LNK_COMDAT,
SectionKind::getReadOnly(), Stub.first->getName(),
COFF::IMAGE_COMDAT_SELECT_ANY));
- EmitAlignment(Log2_32(DL.getPointerSize()));
+ EmitAlignment(Align(DL.getPointerSize()));
OutStreamer->EmitSymbolAttribute(Stub.first, MCSA_Global);
OutStreamer->EmitLabel(Stub.first);
OutStreamer->EmitSymbolValue(Stub.second.getPointer(),
@@ -1607,8 +1595,7 @@ bool AsmPrinter::doFinalization(Module &M) {
"expected llvm.used to be an array type");
if (const auto *A = cast<ConstantArray>(LU->getInitializer())) {
for (const Value *Op : A->operands()) {
- const auto *GV =
- cast<GlobalValue>(Op->stripPointerCastsNoFollowAliases());
+ const auto *GV = cast<GlobalValue>(Op->stripPointerCasts());
// Global symbols with internal or private linkage are not visible to
// the linker, and thus would cause an error when the linker tried to
// preserve the symbol due to the `/include:` directive.
@@ -1679,8 +1666,27 @@ MCSymbol *AsmPrinter::getCurExceptionSym() {
void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
this->MF = &MF;
+
// Get the function symbol.
- CurrentFnSym = getSymbol(&MF.getFunction());
+ if (MAI->needsFunctionDescriptors()) {
+ assert(TM.getTargetTriple().isOSAIX() && "Function descriptor is only"
+ " supported on AIX.");
+ assert(CurrentFnDescSym && "The function descriptor symbol needs to be"
+ " initalized first.");
+
+ // Get the function entry point symbol.
+ CurrentFnSym =
+ OutContext.getOrCreateSymbol("." + CurrentFnDescSym->getName());
+
+ const Function &F = MF.getFunction();
+ MCSectionXCOFF *FnEntryPointSec =
+ cast<MCSectionXCOFF>(getObjFileLowering().SectionForGlobal(&F, TM));
+ // Set the containing csect.
+ cast<MCSymbolXCOFF>(CurrentFnSym)->setContainingCsect(FnEntryPointSec);
+ } else {
+ CurrentFnSym = getSymbol(&MF.getFunction());
+ }
+
CurrentFnSymForSize = CurrentFnSym;
CurrentFnBegin = nullptr;
CurExceptionSym = nullptr;
@@ -1765,7 +1771,7 @@ void AsmPrinter::EmitConstantPool() {
if (CurSection != CPSections[i].S) {
OutStreamer->SwitchSection(CPSections[i].S);
- EmitAlignment(Log2_32(CPSections[i].Alignment));
+ EmitAlignment(Align(CPSections[i].Alignment));
CurSection = CPSections[i].S;
Offset = 0;
}
@@ -1812,7 +1818,7 @@ void AsmPrinter::EmitJumpTableInfo() {
OutStreamer->SwitchSection(ReadOnlySection);
}
- EmitAlignment(Log2_32(MJTI->getEntryAlignment(DL)));
+ EmitAlignment(Align(MJTI->getEntryAlignment(DL)));
// Jump tables in code sections are marked with a data_region directive
// where that's supported.
@@ -2025,10 +2031,10 @@ void AsmPrinter::EmitXXStructorList(const DataLayout &DL, const Constant *List,
}
// Emit the function pointers in the target-specific order
- unsigned Align = Log2_32(DL.getPointerPrefAlignment());
llvm::stable_sort(Structors, [](const Structor &L, const Structor &R) {
return L.Priority < R.Priority;
});
+ const Align Align = DL.getPointerPrefAlignment();
for (Structor &S : Structors) {
const TargetLoweringObjectFile &Obj = getObjFileLowering();
const MCSymbol *KeySym = nullptr;
@@ -2149,23 +2155,20 @@ void AsmPrinter::EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset,
//===----------------------------------------------------------------------===//
// EmitAlignment - Emit an alignment directive to the specified power of
-// two boundary. For example, if you pass in 3 here, you will get an 8
-// byte alignment. If a global value is specified, and if that global has
+// two boundary. If a global value is specified, and if that global has
// an explicit alignment requested, it will override the alignment request
// if required for correctness.
-void AsmPrinter::EmitAlignment(unsigned NumBits, const GlobalObject *GV) const {
+void AsmPrinter::EmitAlignment(Align Alignment, const GlobalObject *GV) const {
if (GV)
- NumBits = getGVAlignmentLog2(GV, GV->getParent()->getDataLayout(), NumBits);
+ Alignment = getGVAlignment(GV, GV->getParent()->getDataLayout(), Alignment);
- if (NumBits == 0) return; // 1-byte aligned: no need to emit alignment.
+ if (Alignment == Align::None())
+ return; // 1-byte aligned: no need to emit alignment.
- assert(NumBits <
- static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
- "undefined behavior");
if (getCurrentSection()->getKind().isText())
- OutStreamer->EmitCodeAlignment(1u << NumBits);
+ OutStreamer->EmitCodeAlignment(Alignment.value());
else
- OutStreamer->EmitValueToAlignment(1u << NumBits);
+ OutStreamer->EmitValueToAlignment(Alignment.value());
}
//===----------------------------------------------------------------------===//
@@ -2481,6 +2484,7 @@ static void emitGlobalConstantStruct(const DataLayout &DL,
}
static void emitGlobalConstantFP(APFloat APF, Type *ET, AsmPrinter &AP) {
+ assert(ET && "Unknown float type");
APInt API = APF.bitcastToAPInt();
// First print a comment with what we think the original floating-point value
@@ -2488,11 +2492,7 @@ static void emitGlobalConstantFP(APFloat APF, Type *ET, AsmPrinter &AP) {
if (AP.isVerbose()) {
SmallString<8> StrVal;
APF.toString(StrVal);
-
- if (ET)
- ET->print(AP.OutStreamer->GetCommentOS());
- else
- AP.OutStreamer->GetCommentOS() << "Printing <null> Type";
+ ET->print(AP.OutStreamer->GetCommentOS());
AP.OutStreamer->GetCommentOS() << ' ' << StrVal << '\n';
}
@@ -2670,7 +2670,7 @@ static void handleIndirectSymViaGOTPCRel(AsmPrinter &AP, const MCExpr **ME,
const GlobalValue *FinalGV = dyn_cast<GlobalValue>(GV->getOperand(0));
const MCSymbol *FinalSym = AP.getSymbol(FinalGV);
*ME = AP.getObjFileLowering().getIndirectSymViaGOTPCRel(
- FinalSym, MV, Offset, AP.MMI, *AP.OutStreamer);
+ FinalGV, FinalSym, MV, Offset, AP.MMI, *AP.OutStreamer);
// Update GOT equivalent usage information
--NumUses;
@@ -2930,7 +2930,7 @@ void AsmPrinter::setupCodePaddingContext(const MachineBasicBlock &MBB,
/// EmitBasicBlockStart - This method prints the label for the specified
/// MachineBasicBlock, an alignment (if present) and a comment describing
/// it if appropriate.
-void AsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
+void AsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) {
// End the previous funclet and start a new one.
if (MBB.isEHFuncletEntry()) {
for (const HandlerInfo &HI : Handlers) {
@@ -2940,8 +2940,9 @@ void AsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
}
// Emit an alignment directive for this block, if needed.
- if (unsigned Align = MBB.getAlignment())
- EmitAlignment(Align);
+ const Align Alignment = MBB.getAlignment();
+ if (Alignment != Align::None())
+ EmitAlignment(Alignment);
MCCodePaddingContext Context;
setupCodePaddingContext(MBB, Context);
OutStreamer->EmitCodePaddingBasicBlockStart(Context);
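
The dominant theme in AsmPrinter.cpp is retiring log2-encoded alignments (getGVAlignmentLog2, `1 << AlignLog`) in favor of the type-safe Align class. The sketch below mimics just the surface used above (construction from a byte value, value(), comparisons); it is a stand-in, not the real llvm/Support/Alignment.h, but it shows why the class is harder to misuse than a raw `unsigned NumBits`.

#include <cassert>
#include <cstdint>

// Stand-in for llvm::Align: stores log2 internally but constructs from and
// exposes the byte value, so "3" can no longer silently mean either
// "log2(8)" or an (invalid) 3-byte alignment depending on the call site.
class Align {
  uint8_t ShiftValue = 0; // log2 of the alignment; 0 => 1-byte aligned

public:
  Align() = default; // the Align::None() case: no alignment required
  explicit Align(uint64_t Value) {
    assert(Value > 0 && (Value & (Value - 1)) == 0 && "not a power of 2");
    while (Value >>= 1)
      ++ShiftValue;
  }
  uint64_t value() const { return uint64_t(1) << ShiftValue; }
  friend bool operator>(Align L, Align R) {
    return L.ShiftValue > R.ShiftValue;
  }
  friend bool operator==(Align L, Align R) {
    return L.ShiftValue == R.ShiftValue;
  }
};

int main() {
  const Align A(8);            // 8-byte alignment, stated in bytes
  assert(A.value() == 8);
  assert(Align(16) > A);       // comparisons use the log2 form
  assert(Align() == Align(1)); // default == 1-byte aligned
}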
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index 7721e996aca5..420df26a2b8b 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -72,7 +72,7 @@ static void srcMgrDiagHandler(const SMDiagnostic &Diag, void *diagInfo) {
unsigned AsmPrinter::addInlineAsmDiagBuffer(StringRef AsmStr,
const MDNode *LocMDNode) const {
if (!DiagInfo) {
- DiagInfo = make_unique<SrcMgrDiagInfo>();
+ DiagInfo = std::make_unique<SrcMgrDiagInfo>();
MCContext &Context = MMI->getContext();
Context.setInlineSourceManager(&DiagInfo->SrcMgr);
@@ -432,6 +432,7 @@ static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
const BlockAddress *BA = MI->getOperand(OpNo).getBlockAddress();
MCSymbol *Sym = AP->GetBlockAddressSymbol(BA);
Sym->print(OS, AP->MAI);
+ MMI->getContext().registerInlineAsmLabel(Sym);
} else if (MI->getOperand(OpNo).isMBB()) {
const MCSymbol *Sym = MI->getOperand(OpNo).getMBB()->getSymbol();
Sym->print(OS, AP->MAI);
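
The llvm::make_unique to std::make_unique swap seen here repeats throughout the patch: with the codebase now on C++14, LLVM's pre-C++14 polyfill is being retired in favor of the standard library version. The replacement is mechanical, as this minimal sketch shows (SrcMgrDiagInfo is reduced to an empty stand-in):

#include <memory>

struct SrcMgrDiagInfo { /* stand-in for the real diagnostics state */ };

int main() {
  // Before: auto DiagInfo = llvm::make_unique<SrcMgrDiagInfo>();
  auto DiagInfo = std::make_unique<SrcMgrDiagInfo>(); // C++14 standard form
  (void)DiagInfo;
}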
diff --git a/lib/CodeGen/AsmPrinter/ByteStreamer.h b/lib/CodeGen/AsmPrinter/ByteStreamer.h
index db2ff458eb2e..09f7496cd4ef 100644
--- a/lib/CodeGen/AsmPrinter/ByteStreamer.h
+++ b/lib/CodeGen/AsmPrinter/ByteStreamer.h
@@ -73,18 +73,18 @@ class HashingByteStreamer final : public ByteStreamer {
class BufferByteStreamer final : public ByteStreamer {
private:
SmallVectorImpl<char> &Buffer;
- SmallVectorImpl<std::string> &Comments;
+ std::vector<std::string> &Comments;
+public:
/// Only verbose textual output needs comments. This will be set to
/// true for that case, and false otherwise. If false, comments passed in to
/// the emit methods will be ignored.
- bool GenerateComments;
+ const bool GenerateComments;
-public:
BufferByteStreamer(SmallVectorImpl<char> &Buffer,
- SmallVectorImpl<std::string> &Comments,
- bool GenerateComments)
- : Buffer(Buffer), Comments(Comments), GenerateComments(GenerateComments) {}
+ std::vector<std::string> &Comments, bool GenerateComments)
+ : Buffer(Buffer), Comments(Comments), GenerateComments(GenerateComments) {
+ }
void EmitInt8(uint8_t Byte, const Twine &Comment) override {
Buffer.push_back(Byte);
if (GenerateComments)
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 932959c311fa..c6457f3626d1 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -98,7 +98,8 @@ using namespace llvm::codeview;
namespace {
class CVMCAdapter : public CodeViewRecordStreamer {
public:
- CVMCAdapter(MCStreamer &OS) : OS(&OS) {}
+ CVMCAdapter(MCStreamer &OS, TypeCollection &TypeTable)
+ : OS(&OS), TypeTable(TypeTable) {}
void EmitBytes(StringRef Data) { OS->EmitBytes(Data); }
@@ -110,8 +111,24 @@ public:
void AddComment(const Twine &T) { OS->AddComment(T); }
+ void AddRawComment(const Twine &T) { OS->emitRawComment(T); }
+
+ bool isVerboseAsm() { return OS->isVerboseAsm(); }
+
+ std::string getTypeName(TypeIndex TI) {
+ std::string TypeName;
+ if (!TI.isNoneType()) {
+ if (TI.isSimple())
+ TypeName = TypeIndex::simpleTypeName(TI);
+ else
+ TypeName = TypeTable.getTypeName(TI);
+ }
+ return TypeName;
+ }
+
private:
MCStreamer *OS = nullptr;
+ TypeCollection &TypeTable;
};
} // namespace
@@ -617,13 +634,6 @@ emitNullTerminatedSymbolName(MCStreamer &OS, StringRef S,
OS.EmitBytes(NullTerminatedString);
}
-static StringRef getTypeLeafName(TypeLeafKind TypeKind) {
- for (const EnumEntry<TypeLeafKind> &EE : getTypeLeafNames())
- if (EE.Value == TypeKind)
- return EE.Name;
- return "";
-}
-
void CodeViewDebug::emitTypeInformation() {
if (TypeTable.empty())
return;
@@ -632,30 +642,11 @@ void CodeViewDebug::emitTypeInformation() {
OS.SwitchSection(Asm->getObjFileLowering().getCOFFDebugTypesSection());
emitCodeViewMagicVersion();
- SmallString<8> CommentPrefix;
- if (OS.isVerboseAsm()) {
- CommentPrefix += '\t';
- CommentPrefix += Asm->MAI->getCommentString();
- CommentPrefix += ' ';
- }
-
TypeTableCollection Table(TypeTable.records());
- SmallString<512> CommentBlock;
- raw_svector_ostream CommentOS(CommentBlock);
- std::unique_ptr<ScopedPrinter> SP;
- std::unique_ptr<TypeDumpVisitor> TDV;
TypeVisitorCallbackPipeline Pipeline;
- if (OS.isVerboseAsm()) {
- // To construct block comment describing the type record for readability.
- SP = llvm::make_unique<ScopedPrinter>(CommentOS);
- SP->setPrefix(CommentPrefix);
- TDV = llvm::make_unique<TypeDumpVisitor>(Table, SP.get(), false);
- Pipeline.addCallbackToPipeline(*TDV);
- }
-
// To emit type record using Codeview MCStreamer adapter
- CVMCAdapter CVMCOS(OS);
+ CVMCAdapter CVMCOS(OS, Table);
TypeRecordMapping typeMapping(CVMCOS);
Pipeline.addCallbackToPipeline(typeMapping);
@@ -664,17 +655,6 @@ void CodeViewDebug::emitTypeInformation() {
// This will fail if the record data is invalid.
CVType Record = Table.getType(*B);
- CommentBlock.clear();
-
- auto RecordLen = Record.length();
- auto RecordKind = Record.kind();
- if (OS.isVerboseAsm())
- CVMCOS.AddComment("Record length");
- CVMCOS.EmitIntValue(RecordLen - 2, 2);
- if (OS.isVerboseAsm())
- CVMCOS.AddComment("Record kind: " + getTypeLeafName(RecordKind));
- CVMCOS.EmitIntValue(RecordKind, sizeof(RecordKind));
-
Error E = codeview::visitTypeRecord(Record, *B, Pipeline);
if (E) {
@@ -682,13 +662,6 @@ void CodeViewDebug::emitTypeInformation() {
llvm_unreachable("produced malformed type record");
}
- if (OS.isVerboseAsm()) {
- // emitRawComment will insert its own tab and comment string before
- // the first line, so strip off our first one. It also prints its own
- // newline.
- OS.emitRawComment(
- CommentOS.str().drop_front(CommentPrefix.size() - 1).rtrim());
- }
B = Table.getNext(*B);
}
}
@@ -1135,7 +1108,7 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
if (!BeginLabel->isDefined() || !EndLabel->isDefined())
continue;
- DIType *DITy = std::get<2>(HeapAllocSite);
+ const DIType *DITy = std::get<2>(HeapAllocSite);
MCSymbol *HeapAllocEnd = beginSymbolRecord(SymbolKind::S_HEAPALLOCSITE);
OS.AddComment("Call site offset");
OS.EmitCOFFSecRel32(BeginLabel, /*Offset=*/0);
@@ -1363,7 +1336,7 @@ void CodeViewDebug::beginFunctionImpl(const MachineFunction *MF) {
const TargetRegisterInfo *TRI = TSI.getRegisterInfo();
const MachineFrameInfo &MFI = MF->getFrameInfo();
const Function &GV = MF->getFunction();
- auto Insertion = FnDebugInfo.insert({&GV, llvm::make_unique<FunctionInfo>()});
+ auto Insertion = FnDebugInfo.insert({&GV, std::make_unique<FunctionInfo>()});
assert(Insertion.second && "function already has info");
CurFn = Insertion.first->second.get();
CurFn->FuncId = NextFuncId++;
@@ -2633,17 +2606,6 @@ void CodeViewDebug::emitLocalVariableList(const FunctionInfo &FI,
emitLocalVariable(FI, L);
}
-/// Only call this on endian-specific types like ulittle16_t and little32_t, or
-/// structs composed of them.
-template <typename T>
-static void copyBytesForDefRange(SmallString<20> &BytePrefix,
- SymbolKind SymKind, const T &DefRangeHeader) {
- BytePrefix.resize(2 + sizeof(T));
- ulittle16_t SymKindLE = ulittle16_t(SymKind);
- memcpy(&BytePrefix[0], &SymKindLE, 2);
- memcpy(&BytePrefix[2], &DefRangeHeader, sizeof(T));
-}
-
void CodeViewDebug::emitLocalVariable(const FunctionInfo &FI,
const LocalVariable &Var) {
// LocalSym record, see SymbolRecord.h for more info.
@@ -2692,8 +2654,9 @@ void CodeViewDebug::emitLocalVariable(const FunctionInfo &FI,
(bool(Flags & LocalSymFlags::IsParameter)
? (EncFP == FI.EncodedParamFramePtrReg)
: (EncFP == FI.EncodedLocalFramePtrReg))) {
- little32_t FPOffset = little32_t(Offset);
- copyBytesForDefRange(BytePrefix, S_DEFRANGE_FRAMEPOINTER_REL, FPOffset);
+ DefRangeFramePointerRelHeader DRHdr;
+ DRHdr.Offset = Offset;
+ OS.EmitCVDefRangeDirective(DefRange.Ranges, DRHdr);
} else {
uint16_t RegRelFlags = 0;
if (DefRange.IsSubfield) {
@@ -2701,28 +2664,27 @@ void CodeViewDebug::emitLocalVariable(const FunctionInfo &FI,
(DefRange.StructOffset
<< DefRangeRegisterRelSym::OffsetInParentShift);
}
- DefRangeRegisterRelSym::Header DRHdr;
+ DefRangeRegisterRelHeader DRHdr;
DRHdr.Register = Reg;
DRHdr.Flags = RegRelFlags;
DRHdr.BasePointerOffset = Offset;
- copyBytesForDefRange(BytePrefix, S_DEFRANGE_REGISTER_REL, DRHdr);
+ OS.EmitCVDefRangeDirective(DefRange.Ranges, DRHdr);
}
} else {
assert(DefRange.DataOffset == 0 && "unexpected offset into register");
if (DefRange.IsSubfield) {
- DefRangeSubfieldRegisterSym::Header DRHdr;
+ DefRangeSubfieldRegisterHeader DRHdr;
DRHdr.Register = DefRange.CVRegister;
DRHdr.MayHaveNoName = 0;
DRHdr.OffsetInParent = DefRange.StructOffset;
- copyBytesForDefRange(BytePrefix, S_DEFRANGE_SUBFIELD_REGISTER, DRHdr);
+ OS.EmitCVDefRangeDirective(DefRange.Ranges, DRHdr);
} else {
- DefRangeRegisterSym::Header DRHdr;
+ DefRangeRegisterHeader DRHdr;
DRHdr.Register = DefRange.CVRegister;
DRHdr.MayHaveNoName = 0;
- copyBytesForDefRange(BytePrefix, S_DEFRANGE_REGISTER, DRHdr);
+ OS.EmitCVDefRangeDirective(DefRange.Ranges, DRHdr);
}
}
- OS.EmitCVDefRangeDirective(DefRange.Ranges, BytePrefix);
}
}
@@ -2896,6 +2858,14 @@ void CodeViewDebug::endFunctionImpl(const MachineFunction *MF) {
CurFn = nullptr;
}
+// Usable locations are valid with non-zero line numbers. A line number of zero
+// corresponds to optimized code that doesn't have a distinct source location.
+// In this case, we try to use the previous or next source location depending on
+// the context.
+static bool isUsableDebugLoc(DebugLoc DL) {
+ return DL && DL.getLine() != 0;
+}
+
void CodeViewDebug::beginInstruction(const MachineInstr *MI) {
DebugHandlerBase::beginInstruction(MI);
@@ -2907,19 +2877,21 @@ void CodeViewDebug::beginInstruction(const MachineInstr *MI) {
// If the first instruction of a new MBB has no location, find the first
// instruction with a location and use that.
DebugLoc DL = MI->getDebugLoc();
- if (!DL && MI->getParent() != PrevInstBB) {
+ if (!isUsableDebugLoc(DL) && MI->getParent() != PrevInstBB) {
for (const auto &NextMI : *MI->getParent()) {
if (NextMI.isDebugInstr())
continue;
DL = NextMI.getDebugLoc();
- if (DL)
+ if (isUsableDebugLoc(DL))
break;
}
+ // FIXME: Handle the case where the BB has no valid locations. This would
+ // probably require doing a real dataflow analysis.
}
PrevInstBB = MI->getParent();
// If we still don't have a debug location, don't record a location.
- if (!DL)
+ if (!isUsableDebugLoc(DL))
return;
maybeRecordLocation(DL, Asm->MF);
@@ -3026,7 +2998,7 @@ void CodeViewDebug::collectGlobalVariableInfo() {
auto Insertion = ScopeGlobals.insert(
{Scope, std::unique_ptr<GlobalVariableList>()});
if (Insertion.second)
- Insertion.first->second = llvm::make_unique<GlobalVariableList>();
+ Insertion.first->second = std::make_unique<GlobalVariableList>();
VariableList = Insertion.first->second.get();
} else if (GV->hasComdat())
// Emit this global variable into a COMDAT section.
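
Besides moving def-range emission onto typed EmitCVDefRangeDirective overloads (replacing the hand-packed copyBytesForDefRange byte prefixes), CodeViewDebug.cpp starts treating line-0 locations as unusable, per the convention that line 0 marks compiler-generated code with no distinct source position. A runnable illustration of that convention, using a stand-in for llvm::DebugLoc:

#include <cassert>

// Stand-in for llvm::DebugLoc: a location may be absent entirely, or
// present with line 0, which conventionally marks compiler-generated code.
struct DebugLoc {
  unsigned Line = 0;
  bool Valid = false;
  explicit operator bool() const { return Valid; }
  unsigned getLine() const { return Line; }
};

// Same predicate as isUsableDebugLoc above: only record locations that
// exist and name a real source line.
static bool isUsableDebugLoc(DebugLoc DL) { return DL && DL.getLine() != 0; }

int main() {
  assert(!isUsableDebugLoc({}));        // no location at all
  assert(!isUsableDebugLoc({0, true})); // line 0: artificial code
  assert(isUsableDebugLoc({42, true})); // a genuine source line
}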
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/lib/CodeGen/AsmPrinter/CodeViewDebug.h
index ce57b789d7fa..7ffd77926cf7 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.h
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.h
@@ -148,7 +148,8 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
SmallVector<LexicalBlock *, 1> ChildBlocks;
std::vector<std::pair<MCSymbol *, MDNode *>> Annotations;
- std::vector<std::tuple<MCSymbol *, MCSymbol *, DIType *>> HeapAllocSites;
+ std::vector<std::tuple<MCSymbol *, MCSymbol *, const DIType *>>
+ HeapAllocSites;
const MCSymbol *Begin = nullptr;
const MCSymbol *End = nullptr;
diff --git a/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp b/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
index ddd60575b6c0..7f9d6c618ad3 100644
--- a/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
+++ b/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
@@ -41,7 +41,7 @@ using EntryIndex = DbgValueHistoryMap::EntryIndex;
static Register isDescribedByReg(const MachineInstr &MI) {
assert(MI.isDebugValue());
assert(MI.getNumOperands() == 4);
- // If the location of variable is an entry value (DW_OP_entry_value)
+ // If the location of variable is an entry value (DW_OP_LLVM_entry_value)
// do not consider it as a register location.
if (MI.getDebugExpression()->isEntryValue())
return 0;
@@ -177,13 +177,13 @@ static void handleNewDebugValue(InlinedEntity Var, const MachineInstr &DV,
IndicesToErase.push_back(Index);
Entry.endEntry(NewIndex);
}
- if (unsigned Reg = isDescribedByReg(DV))
+ if (Register Reg = isDescribedByReg(DV))
TrackedRegs[Reg] |= !Overlaps;
}
// If the new debug value is described by a register, add tracking of
// that register if it is not already tracked.
- if (unsigned NewReg = isDescribedByReg(DV)) {
+ if (Register NewReg = isDescribedByReg(DV)) {
if (!TrackedRegs.count(NewReg))
addRegDescribedVar(RegVars, NewReg, Var);
LiveEntries[Var].insert(NewIndex);
@@ -234,7 +234,7 @@ void llvm::calculateDbgEntityHistory(const MachineFunction *MF,
DbgLabelInstrMap &DbgLabels) {
const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
unsigned SP = TLI->getStackPointerRegisterToSaveRestore();
- unsigned FrameReg = TRI->getFrameRegister(*MF);
+ Register FrameReg = TRI->getFrameRegister(*MF);
RegDescribedVarsMap RegVars;
DbgValueEntriesMap LiveEntries;
for (const auto &MBB : *MF) {
@@ -275,7 +275,7 @@ void llvm::calculateDbgEntityHistory(const MachineFunction *MF,
continue;
// If this is a virtual register, only clobber it since it doesn't
// have aliases.
- if (TRI->isVirtualRegister(MO.getReg()))
+ if (Register::isVirtualRegister(MO.getReg()))
clobberRegisterUses(RegVars, MO.getReg(), DbgValues, LiveEntries,
MI);
// If this is a register def operand, it may end a debug value
@@ -296,7 +296,7 @@ void llvm::calculateDbgEntityHistory(const MachineFunction *MF,
// Don't consider SP to be clobbered by register masks.
for (auto It : RegVars) {
unsigned int Reg = It.first;
- if (Reg != SP && TRI->isPhysicalRegister(Reg) &&
+ if (Reg != SP && Register::isPhysicalRegister(Reg) &&
MO.clobbersPhysReg(Reg))
RegsToClobber.push_back(Reg);
}
diff --git a/lib/CodeGen/AsmPrinter/DebugLocStream.h b/lib/CodeGen/AsmPrinter/DebugLocStream.h
index 789291771b5a..0db86b09d19a 100644
--- a/lib/CodeGen/AsmPrinter/DebugLocStream.h
+++ b/lib/CodeGen/AsmPrinter/DebugLocStream.h
@@ -38,21 +38,18 @@ public:
: CU(CU), EntryOffset(EntryOffset) {}
};
struct Entry {
- const MCSymbol *BeginSym;
- const MCSymbol *EndSym;
+ const MCSymbol *Begin;
+ const MCSymbol *End;
size_t ByteOffset;
size_t CommentOffset;
- Entry(const MCSymbol *BeginSym, const MCSymbol *EndSym, size_t ByteOffset,
- size_t CommentOffset)
- : BeginSym(BeginSym), EndSym(EndSym), ByteOffset(ByteOffset),
- CommentOffset(CommentOffset) {}
};
private:
SmallVector<List, 4> Lists;
SmallVector<Entry, 32> Entries;
SmallString<256> DWARFBytes;
- SmallVector<std::string, 32> Comments;
+ std::vector<std::string> Comments;
+ MCSymbol *Sym;
/// Only verbose textual output needs comments. This will be set to
/// true for that case, and false otherwise.
@@ -63,6 +60,12 @@ public:
size_t getNumLists() const { return Lists.size(); }
const List &getList(size_t LI) const { return Lists[LI]; }
ArrayRef<List> getLists() const { return Lists; }
+ MCSymbol *getSym() const {
+ return Sym;
+ }
+ void setSym(MCSymbol *Sym) {
+ this->Sym = Sym;
+ }
class ListBuilder;
class EntryBuilder;
@@ -93,7 +96,7 @@ private:
/// Until the next call, bytes added to the stream will be added to this
/// entry.
void startEntry(const MCSymbol *BeginSym, const MCSymbol *EndSym) {
- Entries.emplace_back(BeginSym, EndSym, DWARFBytes.size(), Comments.size());
+ Entries.push_back({BeginSym, EndSym, DWARFBytes.size(), Comments.size()});
}
/// Finalize a .debug_loc entry, deleting if it's empty.
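
Dropping Entry's hand-written constructor above makes it an aggregate again, which is why startEntry switches from emplace_back(...) to push_back({...}). A standalone illustration of the idiom, with hypothetical field types in place of MCSymbol pointers:

#include <cstddef>
#include <vector>

// With no user-provided constructor, Entry is an aggregate...
struct Entry {
  const char *Begin;
  const char *End;
  std::size_t ByteOffset;
  std::size_t CommentOffset;
};

int main() {
  std::vector<Entry> Entries;
  // ...so a braced list converts to Entry directly in push_back. Note that
  // emplace_back({...}) would not compile before C++20, which is one reason
  // the constructor-free form pairs with push_back.
  Entries.push_back({"begin", "end", /*ByteOffset=*/0, /*CommentOffset=*/0});
}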
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 9548ad9918c1..a61c98ec1c18 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -208,7 +208,7 @@ void DwarfCompileUnit::addLocationAttribute(
if (!Loc) {
addToAccelTable = true;
Loc = new (DIEValueAllocator) DIELoc;
- DwarfExpr = llvm::make_unique<DIEDwarfExpression>(*Asm, *this, *Loc);
+ DwarfExpr = std::make_unique<DIEDwarfExpression>(*Asm, *this, *Loc);
}
if (Expr) {
@@ -326,14 +326,13 @@ void DwarfCompileUnit::addRange(RangeSpan Range) {
// emitted into and the subprogram was contained within. If these are the
// same then extend our current range, otherwise add this as a new range.
if (CURanges.empty() || !SameAsPrevCU ||
- (&CURanges.back().getEnd()->getSection() !=
- &Range.getEnd()->getSection())) {
+ (&CURanges.back().End->getSection() !=
+ &Range.End->getSection())) {
CURanges.push_back(Range);
- DD->addSectionLabel(Range.getStart());
return;
}
- CURanges.back().setEnd(Range.getEnd());
+ CURanges.back().End = Range.End;
}
void DwarfCompileUnit::initStmtList() {
@@ -399,7 +398,7 @@ DIE &DwarfCompileUnit::updateSubprogramScopeDIE(const DISubprogram *SP) {
} else {
const TargetRegisterInfo *RI = Asm->MF->getSubtarget().getRegisterInfo();
MachineLocation Location(RI->getFrameRegister(*Asm->MF));
- if (RI->isPhysicalRegister(Location.getReg()))
+ if (Register::isPhysicalRegister(Location.getReg()))
addAddress(*SPDie, dwarf::DW_AT_frame_base, Location);
}
}
@@ -468,14 +467,6 @@ void DwarfCompileUnit::constructScopeDIE(
void DwarfCompileUnit::addScopeRangeList(DIE &ScopeDIE,
SmallVector<RangeSpan, 2> Range) {
- const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
-
- // Emit the offset into .debug_ranges or .debug_rnglists as a relocatable
- // label. emitDIE() will handle emitting it appropriately.
- const MCSymbol *RangeSectionSym =
- DD->getDwarfVersion() >= 5
- ? TLOF.getDwarfRnglistsSection()->getBeginSymbol()
- : TLOF.getDwarfRangesSection()->getBeginSymbol();
HasRangeLists = true;
@@ -494,12 +485,17 @@ void DwarfCompileUnit::addScopeRangeList(DIE &ScopeDIE,
// (DW_RLE_startx_endx etc.).
if (DD->getDwarfVersion() >= 5)
addUInt(ScopeDIE, dwarf::DW_AT_ranges, dwarf::DW_FORM_rnglistx, Index);
- else if (isDwoUnit())
- addSectionDelta(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(),
- RangeSectionSym);
- else
- addSectionLabel(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(),
- RangeSectionSym);
+ else {
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+ const MCSymbol *RangeSectionSym =
+ TLOF.getDwarfRangesSection()->getBeginSymbol();
+ if (isDwoUnit())
+ addSectionDelta(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(),
+ RangeSectionSym);
+ else
+ addSectionLabel(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(),
+ RangeSectionSym);
+ }
}
void DwarfCompileUnit::attachRangesOrLowHighPC(
@@ -507,7 +503,7 @@ void DwarfCompileUnit::attachRangesOrLowHighPC(
if (Ranges.size() == 1 || !DD->useRangesSection()) {
const RangeSpan &Front = Ranges.front();
const RangeSpan &Back = Ranges.back();
- attachLowHighPC(Die, Front.getStart(), Back.getEnd());
+ attachLowHighPC(Die, Front.Begin, Back.End);
} else
addScopeRangeList(Die, std::move(Ranges));
}
@@ -517,8 +513,8 @@ void DwarfCompileUnit::attachRangesOrLowHighPC(
SmallVector<RangeSpan, 2> List;
List.reserve(Ranges.size());
for (const InsnRange &R : Ranges)
- List.push_back(RangeSpan(DD->getLabelBeforeInsn(R.first),
- DD->getLabelAfterInsn(R.second)));
+ List.push_back(
+ {DD->getLabelBeforeInsn(R.first), DD->getLabelAfterInsn(R.second)});
attachRangesOrLowHighPC(Die, std::move(List));
}
@@ -647,8 +643,7 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV,
int Offset = TFI->getFrameIndexReference(*Asm->MF, Fragment.FI, FrameReg);
DwarfExpr.addFragmentOffset(Expr);
SmallVector<uint64_t, 8> Ops;
- Ops.push_back(dwarf::DW_OP_plus_uconst);
- Ops.push_back(Offset);
+ DIExpression::appendOffset(Ops, Offset);
// According to
// https://docs.nvidia.com/cuda/archive/10.0/ptx-writers-guide-to-interoperability/index.html#cuda-specific-dwarf
// cuda-gdb requires DW_AT_address_class for all variables to be able to
@@ -892,32 +887,117 @@ void DwarfCompileUnit::constructAbstractSubprogramScopeDIE(
ContextCU->addDIEEntry(*AbsDef, dwarf::DW_AT_object_pointer, *ObjectPointer);
}
-DIE &DwarfCompileUnit::constructCallSiteEntryDIE(DIE &ScopeDIE,
- const DISubprogram &CalleeSP,
- bool IsTail,
- const MCExpr *PCOffset) {
+/// Whether to use the GNU analog for a DWARF5 tag, attribute, or location atom.
+static bool useGNUAnalogForDwarf5Feature(DwarfDebug *DD) {
+ return DD->getDwarfVersion() == 4 && DD->tuneForGDB();
+}
+
+dwarf::Tag DwarfCompileUnit::getDwarf5OrGNUTag(dwarf::Tag Tag) const {
+ if (!useGNUAnalogForDwarf5Feature(DD))
+ return Tag;
+ switch (Tag) {
+ case dwarf::DW_TAG_call_site:
+ return dwarf::DW_TAG_GNU_call_site;
+ case dwarf::DW_TAG_call_site_parameter:
+ return dwarf::DW_TAG_GNU_call_site_parameter;
+ default:
+ llvm_unreachable("DWARF5 tag with no GNU analog");
+ }
+}
+
+dwarf::Attribute
+DwarfCompileUnit::getDwarf5OrGNUAttr(dwarf::Attribute Attr) const {
+ if (!useGNUAnalogForDwarf5Feature(DD))
+ return Attr;
+ switch (Attr) {
+ case dwarf::DW_AT_call_all_calls:
+ return dwarf::DW_AT_GNU_all_call_sites;
+ case dwarf::DW_AT_call_target:
+ return dwarf::DW_AT_GNU_call_site_target;
+ case dwarf::DW_AT_call_origin:
+ return dwarf::DW_AT_abstract_origin;
+ case dwarf::DW_AT_call_pc:
+ return dwarf::DW_AT_low_pc;
+ case dwarf::DW_AT_call_value:
+ return dwarf::DW_AT_GNU_call_site_value;
+ case dwarf::DW_AT_call_tail_call:
+ return dwarf::DW_AT_GNU_tail_call;
+ default:
+ llvm_unreachable("DWARF5 attribute with no GNU analog");
+ }
+}
+
+dwarf::LocationAtom
+DwarfCompileUnit::getDwarf5OrGNULocationAtom(dwarf::LocationAtom Loc) const {
+ if (!useGNUAnalogForDwarf5Feature(DD))
+ return Loc;
+ switch (Loc) {
+ case dwarf::DW_OP_entry_value:
+ return dwarf::DW_OP_GNU_entry_value;
+ default:
+ llvm_unreachable("DWARF5 location atom with no GNU analog");
+ }
+}
+
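A minimal usage sketch (hypothetical caller; the mappings follow the switch tables above, and any other version/tuning combination returns the input unchanged):

    // Assuming CU has DwarfVersion == 4 and is tuned for GDB:
    dwarf::Tag T = CU.getDwarf5OrGNUTag(dwarf::DW_TAG_call_site);
    // T == DW_TAG_GNU_call_site
    dwarf::Attribute A = CU.getDwarf5OrGNUAttr(dwarf::DW_AT_call_value);
    // A == DW_AT_GNU_call_site_value
    dwarf::LocationAtom L =
        CU.getDwarf5OrGNULocationAtom(dwarf::DW_OP_entry_value);
    // L == DW_OP_GNU_entry_value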
+DIE &DwarfCompileUnit::constructCallSiteEntryDIE(
+ DIE &ScopeDIE, const DISubprogram *CalleeSP, bool IsTail,
+ const MCSymbol *PCAddr, const MCExpr *PCOffset, unsigned CallReg) {
// Insert a call site entry DIE within ScopeDIE.
- DIE &CallSiteDIE =
- createAndAddDIE(dwarf::DW_TAG_call_site, ScopeDIE, nullptr);
+ DIE &CallSiteDIE = createAndAddDIE(getDwarf5OrGNUTag(dwarf::DW_TAG_call_site),
+ ScopeDIE, nullptr);
- // For the purposes of showing tail call frames in backtraces, a key piece of
- // information is DW_AT_call_origin, a pointer to the callee DIE.
- DIE *CalleeDIE = getOrCreateSubprogramDIE(&CalleeSP);
- assert(CalleeDIE && "Could not create DIE for call site entry origin");
- addDIEEntry(CallSiteDIE, dwarf::DW_AT_call_origin, *CalleeDIE);
+ if (CallReg) {
+ // Indirect call.
+ addAddress(CallSiteDIE, getDwarf5OrGNUAttr(dwarf::DW_AT_call_target),
+ MachineLocation(CallReg));
+ } else {
+ DIE *CalleeDIE = getOrCreateSubprogramDIE(CalleeSP);
+ assert(CalleeDIE && "Could not create DIE for call site entry origin");
+ addDIEEntry(CallSiteDIE, getDwarf5OrGNUAttr(dwarf::DW_AT_call_origin),
+ *CalleeDIE);
+ }
- if (IsTail) {
+ if (IsTail)
// Attach DW_AT_call_tail_call to tail calls for standards compliance.
- addFlag(CallSiteDIE, dwarf::DW_AT_call_tail_call);
- } else {
- // Attach the return PC to allow the debugger to disambiguate call paths
- // from one function to another.
+ addFlag(CallSiteDIE, getDwarf5OrGNUAttr(dwarf::DW_AT_call_tail_call));
+
+ // Attach the return PC to allow the debugger to disambiguate call paths
+ // from one function to another.
+ if (DD->getDwarfVersion() == 4 && DD->tuneForGDB()) {
+ assert(PCAddr && "Missing PC information for a call");
+ addLabelAddress(CallSiteDIE, dwarf::DW_AT_low_pc, PCAddr);
+ } else if (!IsTail || DD->tuneForGDB()) {
assert(PCOffset && "Missing return PC information for a call");
addAddressExpr(CallSiteDIE, dwarf::DW_AT_call_return_pc, PCOffset);
}
+
return CallSiteDIE;
}
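The two supported call shapes, as exercised by DwarfDebug::constructCallSiteEntryDIEs further down (argument names illustrative):

    // Direct call: the callee's subprogram is known, CallReg is 0.
    CU.constructCallSiteEntryDIE(ScopeDIE, CalleeSP, /*IsTail=*/false,
                                 /*PCAddr=*/nullptr, PCOffset, /*CallReg=*/0);
    // Indirect call: the callee lives in a register, CalleeSP is null.
    CU.constructCallSiteEntryDIE(ScopeDIE, /*CalleeSP=*/nullptr, IsTail,
                                 PCAddr, PCOffset, CallReg);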
+void DwarfCompileUnit::constructCallSiteParmEntryDIEs(
+ DIE &CallSiteDIE, SmallVector<DbgCallSiteParam, 4> &Params) {
+ for (const auto &Param : Params) {
+ unsigned Register = Param.getRegister();
+ auto CallSiteDieParam =
+ DIE::get(DIEValueAllocator,
+ getDwarf5OrGNUTag(dwarf::DW_TAG_call_site_parameter));
+ insertDIE(CallSiteDieParam);
+ addAddress(*CallSiteDieParam, dwarf::DW_AT_location,
+ MachineLocation(Register));
+
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
+ DwarfExpr.setCallSiteParamValueFlag();
+
+ DwarfDebug::emitDebugLocValue(*Asm, nullptr, Param.getValue(), DwarfExpr);
+
+ addBlock(*CallSiteDieParam, getDwarf5OrGNUAttr(dwarf::DW_AT_call_value),
+ DwarfExpr.finalize());
+
+ CallSiteDIE.addChild(CallSiteDieParam);
+ }
+}
+
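For a DWARF 5 target with one interpreted parameter, the resulting subtree looks roughly like this (attribute values illustrative; under GDB + DWARF 4 tuning the GNU analogs are substituted throughout):

    DW_TAG_call_site
      DW_AT_call_origin       (ref to the callee's DW_TAG_subprogram)
      DW_AT_call_return_pc    (address after the call instruction)
      DW_TAG_call_site_parameter
        DW_AT_location        DW_OP_reg5
        DW_AT_call_value      DW_OP_constu 42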
DIE *DwarfCompileUnit::constructImportedEntityDIE(
const DIImportedEntity *Module) {
DIE *IMDie = DIE::get(DIEValueAllocator, (dwarf::Tag)Module->getTag());
@@ -997,11 +1077,11 @@ void DwarfCompileUnit::createAbstractEntity(const DINode *Node,
assert(Scope && Scope->isAbstractScope());
auto &Entity = getAbstractEntities()[Node];
if (isa<const DILocalVariable>(Node)) {
- Entity = llvm::make_unique<DbgVariable>(
+ Entity = std::make_unique<DbgVariable>(
cast<const DILocalVariable>(Node), nullptr /* IA */);
DU->addScopeVariable(Scope, cast<DbgVariable>(Entity.get()));
} else if (isa<const DILabel>(Node)) {
- Entity = llvm::make_unique<DbgLabel>(
+ Entity = std::make_unique<DbgLabel>(
cast<const DILabel>(Node), nullptr /* IA */);
DU->addScopeLabel(Scope, cast<DbgLabel>(Entity.get()));
}
@@ -1081,16 +1161,8 @@ void DwarfCompileUnit::addGlobalTypeUnitType(const DIType *Ty,
GlobalTypes.insert(std::make_pair(std::move(FullName), &getUnitDie()));
}
-/// addVariableAddress - Add DW_AT_location attribute for a
-/// DbgVariable based on provided MachineLocation.
void DwarfCompileUnit::addVariableAddress(const DbgVariable &DV, DIE &Die,
MachineLocation Location) {
- // addBlockByrefAddress is obsolete and will be removed soon.
- // The clang frontend always generates block byref variables with a
- // complex expression that encodes exactly what addBlockByrefAddress
- // would do.
- assert((!DV.isBlockByrefVariable() || DV.hasComplexAddress()) &&
- "block byref variable without a complex expression");
if (DV.hasComplexAddress())
addComplexAddress(DV, Die, dwarf::DW_AT_location, Location);
else
@@ -1133,7 +1205,7 @@ void DwarfCompileUnit::addComplexAddress(const DbgVariable &DV, DIE &Die,
if (DIExpr->isEntryValue()) {
DwarfExpr.setEntryValueFlag();
- DwarfExpr.addEntryValueExpression(Cursor);
+ DwarfExpr.beginEntryValueExpression(Cursor);
}
const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo();
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index ea980dfda17e..1b7ea2673ac0 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -227,12 +227,35 @@ public:
void constructAbstractSubprogramScopeDIE(LexicalScope *Scope);
+ /// This takes a DWARF 5 tag and returns it or a GNU analog.
+ dwarf::Tag getDwarf5OrGNUTag(dwarf::Tag Tag) const;
+
+ /// This takes a DWARF 5 attribute and returns it or a GNU analog.
+ dwarf::Attribute getDwarf5OrGNUAttr(dwarf::Attribute Attr) const;
+
+ /// This takes a DWARF 5 location atom and either returns it or a GNU analog.
+ dwarf::LocationAtom getDwarf5OrGNULocationAtom(dwarf::LocationAtom Loc) const;
+
/// Construct a call site entry DIE describing a call within \p Scope to a
- /// callee described by \p CalleeSP. \p IsTail specifies whether the call is
- /// a tail call. \p PCOffset must be non-zero for non-tail calls or be the
+ /// callee described by \p CalleeSP.
+ /// \p IsTail specifies whether the call is a tail call.
+ /// \p PCAddr (used for GDB + DWARF 4 tuning) points to the PC value after
+ /// the call instruction.
+  /// \p PCOffset (used for all tunings other than GDB + DWARF 4) is the
+  /// function-local offset to the PC value after the call instruction; it
+  /// must be non-null for non-tail calls, and also for tail calls under GDB
+  /// tuning, since with DWARF 5 we still generate PC info for tail calls.
- DIE &constructCallSiteEntryDIE(DIE &ScopeDIE, const DISubprogram &CalleeSP,
- bool IsTail, const MCExpr *PCOffset);
+  /// \p CallReg is a register location for an indirect call; for direct calls
+  /// it is set to 0.
+ DIE &constructCallSiteEntryDIE(DIE &ScopeDIE, const DISubprogram *CalleeSP,
+ bool IsTail, const MCSymbol *PCAddr,
+ const MCExpr *PCOffset, unsigned CallReg);
+ /// Construct call site parameter DIEs for the \p CallSiteDIE. The \p Params
+  /// were collected by \ref collectCallSiteParameters.
+ /// Note: The order of parameters does not matter, since debuggers recognize
+ /// call site parameters by the DW_AT_location attribute.
+ void constructCallSiteParmEntryDIEs(DIE &CallSiteDIE,
+ SmallVector<DbgCallSiteParam, 4> &Params);
/// Construct import_module DIE.
DIE *constructImportedEntityDIE(const DIImportedEntity *Module);
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 71bb2b0858cc..c505e77e5acd 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -26,6 +26,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/Dwarf.h"
@@ -39,6 +40,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/DebugInfo/DWARF/DWARFExpression.h"
@@ -83,6 +85,8 @@ using namespace llvm;
#define DEBUG_TYPE "dwarfdebug"
+STATISTIC(NumCSParams, "Number of dbg call site params created");
+
static cl::opt<bool>
DisableDebugInfoPrinting("disable-debug-info-print", cl::Hidden,
cl::desc("Disable debug info printing"));
@@ -166,26 +170,26 @@ static const char *const DbgTimerDescription = "DWARF Debug Writer";
static constexpr unsigned ULEB128PadSize = 4;
void DebugLocDwarfExpression::emitOp(uint8_t Op, const char *Comment) {
- BS.EmitInt8(
+ getActiveStreamer().EmitInt8(
Op, Comment ? Twine(Comment) + " " + dwarf::OperationEncodingString(Op)
: dwarf::OperationEncodingString(Op));
}
void DebugLocDwarfExpression::emitSigned(int64_t Value) {
- BS.EmitSLEB128(Value, Twine(Value));
+ getActiveStreamer().EmitSLEB128(Value, Twine(Value));
}
void DebugLocDwarfExpression::emitUnsigned(uint64_t Value) {
- BS.EmitULEB128(Value, Twine(Value));
+ getActiveStreamer().EmitULEB128(Value, Twine(Value));
}
void DebugLocDwarfExpression::emitData1(uint8_t Value) {
- BS.EmitInt8(Value, Twine(Value));
+ getActiveStreamer().EmitInt8(Value, Twine(Value));
}
void DebugLocDwarfExpression::emitBaseTypeRef(uint64_t Idx) {
assert(Idx < (1ULL << (ULEB128PadSize * 7)) && "Idx wont fit");
- BS.EmitULEB128(Idx, Twine(Idx), ULEB128PadSize);
+ getActiveStreamer().EmitULEB128(Idx, Twine(Idx), ULEB128PadSize);
}
bool DebugLocDwarfExpression::isFrameRegister(const TargetRegisterInfo &TRI,
@@ -194,54 +198,34 @@ bool DebugLocDwarfExpression::isFrameRegister(const TargetRegisterInfo &TRI,
return false;
}
-bool DbgVariable::isBlockByrefVariable() const {
- assert(getVariable() && "Invalid complex DbgVariable!");
- return getVariable()->getType()->isBlockByrefStruct();
+void DebugLocDwarfExpression::enableTemporaryBuffer() {
+ assert(!IsBuffering && "Already buffering?");
+ if (!TmpBuf)
+ TmpBuf = std::make_unique<TempBuffer>(OutBS.GenerateComments);
+ IsBuffering = true;
}
-const DIType *DbgVariable::getType() const {
- DIType *Ty = getVariable()->getType();
- // FIXME: isBlockByrefVariable should be reformulated in terms of complex
- // addresses instead.
- if (Ty->isBlockByrefStruct()) {
- /* Byref variables, in Blocks, are declared by the programmer as
- "SomeType VarName;", but the compiler creates a
- __Block_byref_x_VarName struct, and gives the variable VarName
- either the struct, or a pointer to the struct, as its type. This
- is necessary for various behind-the-scenes things the compiler
- needs to do with by-reference variables in blocks.
-
- However, as far as the original *programmer* is concerned, the
- variable should still have type 'SomeType', as originally declared.
-
- The following function dives into the __Block_byref_x_VarName
- struct to find the original type of the variable. This will be
- passed back to the code generating the type for the Debug
- Information Entry for the variable 'VarName'. 'VarName' will then
- have the original type 'SomeType' in its debug information.
-
- The original type 'SomeType' will be the type of the field named
- 'VarName' inside the __Block_byref_x_VarName struct.
-
- NOTE: In order for this to not completely fail on the debugger
- side, the Debug Information Entry for the variable VarName needs to
- have a DW_AT_location that tells the debugger how to unwind through
- the pointers and __Block_byref_x_VarName struct to find the actual
- value of the variable. The function addBlockByrefType does this. */
- DIType *subType = Ty;
- uint16_t tag = Ty->getTag();
-
- if (tag == dwarf::DW_TAG_pointer_type)
- subType = cast<DIDerivedType>(Ty)->getBaseType();
-
- auto Elements = cast<DICompositeType>(subType)->getElements();
- for (unsigned i = 0, N = Elements.size(); i < N; ++i) {
- auto *DT = cast<DIDerivedType>(Elements[i]);
- if (getName() == DT->getName())
- return DT->getBaseType();
- }
+void DebugLocDwarfExpression::disableTemporaryBuffer() { IsBuffering = false; }
+
+unsigned DebugLocDwarfExpression::getTemporaryBufferSize() {
+ return TmpBuf ? TmpBuf->Bytes.size() : 0;
+}
+
+void DebugLocDwarfExpression::commitTemporaryBuffer() {
+ if (!TmpBuf)
+ return;
+ for (auto Byte : enumerate(TmpBuf->Bytes)) {
+ const char *Comment = (Byte.index() < TmpBuf->Comments.size())
+ ? TmpBuf->Comments[Byte.index()].c_str()
+ : "";
+ OutBS.EmitInt8(Byte.value(), Comment);
}
- return Ty;
+ TmpBuf->Bytes.clear();
+ TmpBuf->Comments.clear();
+}
+
+const DIType *DbgVariable::getType() const {
+ return getVariable()->getType();
}
/// Get .debug_loc entry for the instruction range starting at MI.
@@ -275,7 +259,7 @@ void DbgVariable::initializeDbgValue(const MachineInstr *DbgValue) {
assert(getInlinedAt() == DbgValue->getDebugLoc()->getInlinedAt() &&
"Wrong inlined-at");
- ValueLoc = llvm::make_unique<DbgValueLoc>(getDebugLocValue(DbgValue));
+ ValueLoc = std::make_unique<DbgValueLoc>(getDebugLocValue(DbgValue));
if (auto *E = DbgValue->getDebugExpression())
if (E->getNumElements())
FrameIndexExprs.push_back({0, E});
@@ -551,6 +535,157 @@ void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU,
}
}
+/// Try to interpret values loaded into registers that forward parameters
+/// for \p CallMI. Store each parameter, together with its interpreted value,
+/// into \p Params.
+static void collectCallSiteParameters(const MachineInstr *CallMI,
+ ParamSet &Params) {
+ auto *MF = CallMI->getMF();
+ auto CalleesMap = MF->getCallSitesInfo();
+ auto CallFwdRegsInfo = CalleesMap.find(CallMI);
+
+ // There is no information for the call instruction.
+ if (CallFwdRegsInfo == CalleesMap.end())
+ return;
+
+ auto *MBB = CallMI->getParent();
+ const auto &TRI = MF->getSubtarget().getRegisterInfo();
+ const auto &TII = MF->getSubtarget().getInstrInfo();
+ const auto &TLI = MF->getSubtarget().getTargetLowering();
+
+ // Skip the call instruction.
+ auto I = std::next(CallMI->getReverseIterator());
+
+ DenseSet<unsigned> ForwardedRegWorklist;
+ // Add all the forwarding registers into the ForwardedRegWorklist.
+ for (auto ArgReg : CallFwdRegsInfo->second) {
+ bool InsertedReg = ForwardedRegWorklist.insert(ArgReg.Reg).second;
+ assert(InsertedReg && "Single register used to forward two arguments?");
+ (void)InsertedReg;
+ }
+
+  // We erase a forwarding register from the ForwardedRegWorklist once
+  // describeLoadedValue() successfully describes the value loaded into it.
+  // For the registers remaining in the worklist, we try to generate an entry
+  // value expression for their call site value description, provided the call
+  // is within the entry MBB.
+  // The RegsForEntryValues maps a forwarding register into the register
+  // holding the entry value.
+  // TODO: Handle situations in which a call site parameter value can be
+  // described as the entry value within basic blocks other than the first
+  // one.
+ bool ShouldTryEmitEntryVals = MBB->getIterator() == MF->begin();
+ DenseMap<unsigned, unsigned> RegsForEntryValues;
+
+  // If the MI is an instruction defining one or more parameters' forwarding
+  // registers, add those defines. We can currently only describe forwarded
+  // registers that are explicitly defined, but we also keep track of
+  // implicit defines so those registers can be removed from the worklist.
+ auto getForwardingRegsDefinedByMI = [&](const MachineInstr &MI,
+ SmallVectorImpl<unsigned> &Explicit,
+ SmallVectorImpl<unsigned> &Implicit) {
+ if (MI.isDebugInstr())
+ return;
+
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && MO.isDef() &&
+ Register::isPhysicalRegister(MO.getReg())) {
+ for (auto FwdReg : ForwardedRegWorklist) {
+ if (TRI->regsOverlap(FwdReg, MO.getReg())) {
+ if (MO.isImplicit())
+ Implicit.push_back(FwdReg);
+ else
+ Explicit.push_back(FwdReg);
+ break;
+ }
+ }
+ }
+ }
+ };
+
+ auto finishCallSiteParam = [&](DbgValueLoc DbgLocVal, unsigned Reg) {
+ unsigned FwdReg = Reg;
+ if (ShouldTryEmitEntryVals) {
+ auto EntryValReg = RegsForEntryValues.find(Reg);
+ if (EntryValReg != RegsForEntryValues.end())
+ FwdReg = EntryValReg->second;
+ }
+
+ DbgCallSiteParam CSParm(FwdReg, DbgLocVal);
+ Params.push_back(CSParm);
+ ++NumCSParams;
+ };
+
+  // Search for values loaded into the forwarding registers.
+  for (; I != MBB->rend(); ++I) {
+    // If we encounter another call, we cannot interpret the parameters'
+    // forwarding registers any further; likewise, stop once every parameter
+    // has been interpreted and the worklist is empty.
+ if (I->isCall())
+ return;
+
+ if (ForwardedRegWorklist.empty())
+ return;
+
+ SmallVector<unsigned, 4> ExplicitFwdRegDefs;
+ SmallVector<unsigned, 4> ImplicitFwdRegDefs;
+ getForwardingRegsDefinedByMI(*I, ExplicitFwdRegDefs, ImplicitFwdRegDefs);
+ if (ExplicitFwdRegDefs.empty() && ImplicitFwdRegDefs.empty())
+ continue;
+
+    // If the MI clobbers more than one forwarding register, we must remove
+    // all of them from the worklist.
+ for (auto Reg : concat<unsigned>(ExplicitFwdRegDefs, ImplicitFwdRegDefs))
+ ForwardedRegWorklist.erase(Reg);
+
+ // The describeLoadedValue() hook currently does not have any information
+ // about which register it should describe in case of multiple defines, so
+ // for now we only handle instructions where a forwarded register is (at
+ // least partially) defined by the instruction's single explicit define.
+ if (I->getNumExplicitDefs() != 1 || ExplicitFwdRegDefs.empty())
+ continue;
+ unsigned Reg = ExplicitFwdRegDefs[0];
+
+ if (auto ParamValue = TII->describeLoadedValue(*I)) {
+ if (ParamValue->first.isImm()) {
+ int64_t Val = ParamValue->first.getImm();
+ DbgValueLoc DbgLocVal(ParamValue->second, Val);
+ finishCallSiteParam(DbgLocVal, Reg);
+ } else if (ParamValue->first.isReg()) {
+ Register RegLoc = ParamValue->first.getReg();
+ unsigned SP = TLI->getStackPointerRegisterToSaveRestore();
+ Register FP = TRI->getFrameRegister(*MF);
+ bool IsSPorFP = (RegLoc == SP) || (RegLoc == FP);
+ if (TRI->isCalleeSavedPhysReg(RegLoc, *MF) || IsSPorFP) {
+ DbgValueLoc DbgLocVal(ParamValue->second,
+ MachineLocation(RegLoc,
+ /*IsIndirect=*/IsSPorFP));
+ finishCallSiteParam(DbgLocVal, Reg);
+ } else if (ShouldTryEmitEntryVals) {
+ ForwardedRegWorklist.insert(RegLoc);
+ RegsForEntryValues[RegLoc] = Reg;
+ }
+ }
+ }
+ }
+
+  // Describe the call site parameter values that remain in the worklist as
+  // entry values.
+ if (ShouldTryEmitEntryVals) {
+ // Create an expression where the register's entry value is used.
+ DIExpression *EntryExpr = DIExpression::get(
+ MF->getFunction().getContext(), {dwarf::DW_OP_LLVM_entry_value, 1});
+ for (auto RegEntry : ForwardedRegWorklist) {
+ unsigned FwdReg = RegEntry;
+ auto EntryValReg = RegsForEntryValues.find(RegEntry);
+ if (EntryValReg != RegsForEntryValues.end())
+ FwdReg = EntryValReg->second;
+
+ DbgValueLoc DbgLocVal(EntryExpr, MachineLocation(RegEntry));
+ DbgCallSiteParam CSParm(FwdReg, DbgLocVal);
+ Params.push_back(CSParm);
+ ++NumCSParams;
+ }
+ }
+}
+
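A worked example may help (x86-64 flavored; registers and values illustrative). Scanning backwards from the call:

    movl  $42, %edi     ; describeLoadedValue() -> imm 42: EDI's call site
                        ;   value becomes DW_OP_constu 42
    movq  %rbx, %rsi    ; %rbx is callee-saved: RSI described as %rbx
    movq  %rax, %rdx    ; %rax is neither callee-saved nor SP/FP: in the
                        ;   entry MBB, %rax joins the worklist with
                        ;   RegsForEntryValues[%rax] = %rdx
    callq callee

Any register still in the worklist when the scan ends in the entry block is then described with an entry value expression, and RegsForEntryValues maps it back to the parameter register it forwards (here, the entry value of %rax describes %rdx).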
void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP,
DwarfCompileUnit &CU, DIE &ScopeDIE,
const MachineFunction &MF) {
@@ -563,10 +698,11 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP,
// for both tail and non-tail calls. Don't use DW_AT_call_all_source_calls
// because one of its requirements is not met: call site entries for
// optimized-out calls are elided.
- CU.addFlag(ScopeDIE, dwarf::DW_AT_call_all_calls);
+ CU.addFlag(ScopeDIE, CU.getDwarf5OrGNUAttr(dwarf::DW_AT_call_all_calls));
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
assert(TII && "TargetInstrInfo not found: cannot label tail calls");
+ bool ApplyGNUExtensions = getDwarfVersion() == 4 && tuneForGDB();
// Emit call site entries for each call or tail call in the function.
for (const MachineBasicBlock &MBB : MF) {
@@ -581,30 +717,66 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP,
return;
// If this is a direct call, find the callee's subprogram.
+ // In the case of an indirect call find the register that holds
+ // the callee.
const MachineOperand &CalleeOp = MI.getOperand(0);
- if (!CalleeOp.isGlobal())
- continue;
- const Function *CalleeDecl = dyn_cast<Function>(CalleeOp.getGlobal());
- if (!CalleeDecl || !CalleeDecl->getSubprogram())
+ if (!CalleeOp.isGlobal() && !CalleeOp.isReg())
continue;
+ unsigned CallReg = 0;
+ const DISubprogram *CalleeSP = nullptr;
+ const Function *CalleeDecl = nullptr;
+ if (CalleeOp.isReg()) {
+ CallReg = CalleeOp.getReg();
+ if (!CallReg)
+ continue;
+ } else {
+ CalleeDecl = dyn_cast<Function>(CalleeOp.getGlobal());
+ if (!CalleeDecl || !CalleeDecl->getSubprogram())
+ continue;
+ CalleeSP = CalleeDecl->getSubprogram();
+ }
+
// TODO: Omit call site entries for runtime calls (objc_msgSend, etc).
- // TODO: Add support for indirect calls.
bool IsTail = TII->isTailCall(MI);
- // For tail calls, no return PC information is needed. For regular calls,
- // the return PC is needed to disambiguate paths in the call graph which
- // could lead to some target function.
+ // For tail calls, for non-gdb tuning, no return PC information is needed.
+ // For regular calls (and tail calls in GDB tuning), the return PC
+ // is needed to disambiguate paths in the call graph which could lead to
+ // some target function.
const MCExpr *PCOffset =
- IsTail ? nullptr : getFunctionLocalOffsetAfterInsn(&MI);
+ (IsTail && !tuneForGDB()) ? nullptr
+ : getFunctionLocalOffsetAfterInsn(&MI);
+
+ // Address of a call-like instruction for a normal call or a jump-like
+ // instruction for a tail call. This is needed for GDB + DWARF 4 tuning.
+      const MCSymbol *PCAddr =
+          ApplyGNUExtensions ? getLabelAfterInsn(&MI) : nullptr;
+
+ assert((IsTail || PCOffset || PCAddr) &&
+ "Call without return PC information");
- assert((IsTail || PCOffset) && "Call without return PC information");
LLVM_DEBUG(dbgs() << "CallSiteEntry: " << MF.getName() << " -> "
- << CalleeDecl->getName() << (IsTail ? " [tail]" : "")
- << "\n");
- CU.constructCallSiteEntryDIE(ScopeDIE, *CalleeDecl->getSubprogram(),
- IsTail, PCOffset);
+ << (CalleeDecl ? CalleeDecl->getName()
+ : StringRef(MF.getSubtarget()
+ .getRegisterInfo()
+ ->getName(CallReg)))
+ << (IsTail ? " [IsTail]" : "") << "\n");
+
+ DIE &CallSiteDIE =
+ CU.constructCallSiteEntryDIE(ScopeDIE, CalleeSP, IsTail, PCAddr,
+ PCOffset, CallReg);
+
+ // GDB and LLDB support call site parameter debug info.
+ if (Asm->TM.Options.EnableDebugEntryValues &&
+ (tuneForGDB() || tuneForLLDB())) {
+ ParamSet Params;
+ // Try to interpret values of call site parameters.
+ collectCallSiteParameters(&MI, Params);
+ CU.constructCallSiteParmEntryDIEs(CallSiteDIE, Params);
+ }
}
}
}
@@ -680,7 +852,7 @@ DwarfDebug::getOrCreateDwarfCompileUnit(const DICompileUnit *DIUnit) {
CompilationDir = DIUnit->getDirectory();
- auto OwnedUnit = llvm::make_unique<DwarfCompileUnit>(
+ auto OwnedUnit = std::make_unique<DwarfCompileUnit>(
InfoHolder.getUnits().size(), DIUnit, Asm, this, &InfoHolder);
DwarfCompileUnit &NewCU = *OwnedUnit;
InfoHolder.addUnit(std::move(OwnedUnit));
@@ -793,8 +965,6 @@ void DwarfDebug::beginModule() {
DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
Holder.setRnglistsTableBaseSym(
Asm->createTempSymbol("rnglists_table_base"));
- Holder.setLoclistsTableBaseSym(
- Asm->createTempSymbol("loclists_table_base"));
if (useSplitDwarf())
InfoHolder.setRnglistsTableBaseSym(
@@ -907,7 +1077,7 @@ void DwarfDebug::finalizeModuleInfo() {
// If we're splitting the dwarf out now that we've got the entire
// CU then add the dwo id to it.
auto *SkCU = TheCU.getSkeleton();
- if (useSplitDwarf() && !empty(TheCU.getUnitDie().children())) {
+ if (useSplitDwarf() && !TheCU.getUnitDie().children().empty()) {
finishUnitAttributes(TheCU.getCUNode(), TheCU);
TheCU.addString(TheCU.getUnitDie(), dwarf::DW_AT_GNU_dwo_name,
Asm->TM.Options.MCOptions.SplitDwarfFile);
@@ -951,7 +1121,7 @@ void DwarfDebug::finalizeModuleInfo() {
// 2.17.3).
U.addUInt(U.getUnitDie(), dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, 0);
else
- U.setBaseAddress(TheCU.getRanges().front().getStart());
+ U.setBaseAddress(TheCU.getRanges().front().Begin);
U.attachRangesOrLowHighPC(U.getUnitDie(), TheCU.takeRanges());
}
@@ -959,15 +1129,19 @@ void DwarfDebug::finalizeModuleInfo() {
// is a bit pessimistic under LTO.
if (!AddrPool.isEmpty() &&
(getDwarfVersion() >= 5 ||
- (SkCU && !empty(TheCU.getUnitDie().children()))))
+ (SkCU && !TheCU.getUnitDie().children().empty())))
U.addAddrTableBase();
if (getDwarfVersion() >= 5) {
if (U.hasRangeLists())
U.addRnglistsBase();
- if (!DebugLocs.getLists().empty() && !useSplitDwarf())
- U.addLoclistsBase();
+ if (!DebugLocs.getLists().empty() && !useSplitDwarf()) {
+ DebugLocs.setSym(Asm->createTempSymbol("loclists_table_base"));
+ U.addSectionLabel(U.getUnitDie(), dwarf::DW_AT_loclists_base,
+ DebugLocs.getSym(),
+ TLOF.getDwarfLoclistsSection()->getBeginSymbol());
+ }
}
auto *CUNode = cast<DICompileUnit>(P.first);
@@ -1105,7 +1279,7 @@ void DwarfDebug::collectVariableInfoFromMFTable(
continue;
ensureAbstractEntityIsCreatedIfScoped(TheCU, Var.first, Scope->getScopeNode());
- auto RegVar = llvm::make_unique<DbgVariable>(
+ auto RegVar = std::make_unique<DbgVariable>(
cast<DILocalVariable>(Var.first), Var.second);
RegVar->initializeMMI(VI.Expr, VI.Slot);
if (DbgVariable *DbgVar = MFVars.lookup(Var))
@@ -1316,13 +1490,13 @@ DbgEntity *DwarfDebug::createConcreteEntity(DwarfCompileUnit &TheCU,
ensureAbstractEntityIsCreatedIfScoped(TheCU, Node, Scope.getScopeNode());
if (isa<const DILocalVariable>(Node)) {
ConcreteEntities.push_back(
- llvm::make_unique<DbgVariable>(cast<const DILocalVariable>(Node),
+ std::make_unique<DbgVariable>(cast<const DILocalVariable>(Node),
Location));
InfoHolder.addScopeVariable(&Scope,
cast<DbgVariable>(ConcreteEntities.back().get()));
} else if (isa<const DILabel>(Node)) {
ConcreteEntities.push_back(
- llvm::make_unique<DbgLabel>(cast<const DILabel>(Node),
+ std::make_unique<DbgLabel>(cast<const DILabel>(Node),
Location, Sym));
InfoHolder.addScopeLabel(&Scope,
cast<DbgLabel>(ConcreteEntities.back().get()));
@@ -1419,11 +1593,14 @@ void DwarfDebug::collectEntityInfo(DwarfCompileUnit &TheCU,
LexicalScope *Scope = nullptr;
const DILabel *Label = cast<DILabel>(IL.first);
+ // The scope could have an extra lexical block file.
+ const DILocalScope *LocalScope =
+ Label->getScope()->getNonLexicalBlockFileScope();
// Get inlined DILocation if it is inlined label.
if (const DILocation *IA = IL.second)
- Scope = LScopes.findInlinedScope(Label->getScope(), IA);
+ Scope = LScopes.findInlinedScope(LocalScope, IA);
else
- Scope = LScopes.findLexicalScope(Label->getScope());
+ Scope = LScopes.findLexicalScope(LocalScope);
// If label scope is not found then skip this label.
if (!Scope)
continue;
@@ -1607,6 +1784,9 @@ void DwarfDebug::beginFunctionImpl(const MachineFunction *MF) {
if (SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug)
return;
+ SectionLabels.insert(std::make_pair(&Asm->getFunctionBegin()->getSection(),
+ Asm->getFunctionBegin()));
+
DwarfCompileUnit &CU = getOrCreateDwarfCompileUnit(SP->getUnit());
// Set DwarfDwarfCompileUnitID in MCContext to the Compile Unit this function
@@ -1654,7 +1834,7 @@ void DwarfDebug::endFunctionImpl(const MachineFunction *MF) {
collectEntityInfo(TheCU, SP, Processed);
// Add the range of this function to the list of ranges for the CU.
- TheCU.addRange(RangeSpan(Asm->getFunctionBegin(), Asm->getFunctionEnd()));
+ TheCU.addRange({Asm->getFunctionBegin(), Asm->getFunctionEnd()});
// Under -gmlt, skip building the subprogram if there are no inlined
// subroutines inside it. But with -fdebug-info-for-profiling, the subprogram
@@ -1836,9 +2016,10 @@ static dwarf::PubIndexEntryDescriptor computeIndexValue(DwarfUnit *CU,
case dwarf::DW_TAG_union_type:
case dwarf::DW_TAG_enumeration_type:
return dwarf::PubIndexEntryDescriptor(
- dwarf::GIEK_TYPE, CU->getLanguage() != dwarf::DW_LANG_C_plus_plus
- ? dwarf::GIEL_STATIC
- : dwarf::GIEL_EXTERNAL);
+ dwarf::GIEK_TYPE,
+ dwarf::isCPlusPlus((dwarf::SourceLanguage)CU->getLanguage())
+ ? dwarf::GIEL_EXTERNAL
+ : dwarf::GIEL_STATIC);
case dwarf::DW_TAG_typedef:
case dwarf::DW_TAG_base_type:
case dwarf::DW_TAG_subrange_type:
@@ -1967,7 +2148,7 @@ void DwarfDebug::emitDebugLocEntry(ByteStreamer &Streamer,
DWARFExpression Expr(Data, getDwarfVersion(), PtrSize);
using Encoding = DWARFExpression::Operation::Encoding;
- uint32_t Offset = 0;
+ uint64_t Offset = 0;
for (auto &Op : Expr) {
assert(Op.getCode() != dwarf::DW_OP_const_type &&
"3 operand ops not yet supported");
@@ -1990,7 +2171,7 @@ void DwarfDebug::emitDebugLocEntry(ByteStreamer &Streamer,
if (Comment != End)
Comment++;
} else {
- for (uint32_t J = Offset; J < Op.getOperandEndOffset(I); ++J)
+ for (uint64_t J = Offset; J < Op.getOperandEndOffset(I); ++J)
Streamer.EmitInt8(Data.getData()[J], Comment != End ? *(Comment++) : "");
}
Offset = Op.getOperandEndOffset(I);
@@ -2020,7 +2201,7 @@ void DwarfDebug::emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT,
if (DIExpr->isEntryValue()) {
DwarfExpr.setEntryValueFlag();
- DwarfExpr.addEntryValueExpression(Cursor);
+ DwarfExpr.beginEntryValueExpression(Cursor);
}
const TargetRegisterInfo &TRI = *AP.MF->getSubtarget().getRegisterInfo();
@@ -2083,7 +2264,7 @@ void DwarfDebug::emitDebugLocEntryLocation(const DebugLocStream::Entry &Entry,
}
// Emit the common part of the DWARF 5 range/locations list tables header.
-static void emitListsTableHeaderStart(AsmPrinter *Asm, const DwarfFile &Holder,
+static void emitListsTableHeaderStart(AsmPrinter *Asm,
MCSymbol *TableStart,
MCSymbol *TableEnd) {
// Build the table header, which starts with the length field.
@@ -2108,7 +2289,7 @@ static MCSymbol *emitRnglistsTableHeader(AsmPrinter *Asm,
const DwarfFile &Holder) {
MCSymbol *TableStart = Asm->createTempSymbol("debug_rnglist_table_start");
MCSymbol *TableEnd = Asm->createTempSymbol("debug_rnglist_table_end");
- emitListsTableHeaderStart(Asm, Holder, TableStart, TableEnd);
+ emitListsTableHeaderStart(Asm, TableStart, TableEnd);
Asm->OutStreamer->AddComment("Offset entry count");
Asm->emitInt32(Holder.getRangeLists().size());
@@ -2125,94 +2306,147 @@ static MCSymbol *emitRnglistsTableHeader(AsmPrinter *Asm,
// designates the end of the table for the caller to emit when the table is
// complete.
static MCSymbol *emitLoclistsTableHeader(AsmPrinter *Asm,
- const DwarfFile &Holder) {
+ const DwarfDebug &DD) {
MCSymbol *TableStart = Asm->createTempSymbol("debug_loclist_table_start");
MCSymbol *TableEnd = Asm->createTempSymbol("debug_loclist_table_end");
- emitListsTableHeaderStart(Asm, Holder, TableStart, TableEnd);
+ emitListsTableHeaderStart(Asm, TableStart, TableEnd);
+
+ const auto &DebugLocs = DD.getDebugLocs();
// FIXME: Generate the offsets table and use DW_FORM_loclistx with the
// DW_AT_loclists_base attribute. Until then set the number of offsets to 0.
Asm->OutStreamer->AddComment("Offset entry count");
Asm->emitInt32(0);
- Asm->OutStreamer->EmitLabel(Holder.getLoclistsTableBaseSym());
+ Asm->OutStreamer->EmitLabel(DebugLocs.getSym());
return TableEnd;
}
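For reference, the .debug_loclists header emitted here has the standard DWARF 5 shape (a sketch; the leading fields are presumably produced by emitListsTableHeaderStart):

    unit_length               (TableEnd - TableStart)
    version                   5
    address_size / segment_selector_size
    offset_entry_count        0       ; see the FIXME above
    <DebugLocs.getSym()>              ; DW_AT_loclists_base points here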
-// Emit locations into the .debug_loc/.debug_rnglists section.
-void DwarfDebug::emitDebugLoc() {
- if (DebugLocs.getLists().empty())
- return;
+template <typename Ranges, typename PayloadEmitter>
+static void emitRangeList(
+ DwarfDebug &DD, AsmPrinter *Asm, MCSymbol *Sym, const Ranges &R,
+ const DwarfCompileUnit &CU, unsigned BaseAddressx, unsigned OffsetPair,
+ unsigned StartxLength, unsigned EndOfList,
+    StringRef (*StringifyEnum)(unsigned), bool ShouldUseBaseAddress,
+    PayloadEmitter EmitPayload) {
- bool IsLocLists = getDwarfVersion() >= 5;
- MCSymbol *TableEnd = nullptr;
- if (IsLocLists) {
- Asm->OutStreamer->SwitchSection(
- Asm->getObjFileLowering().getDwarfLoclistsSection());
- TableEnd = emitLoclistsTableHeader(Asm, useSplitDwarf() ? SkeletonHolder
- : InfoHolder);
- } else {
- Asm->OutStreamer->SwitchSection(
- Asm->getObjFileLowering().getDwarfLocSection());
- }
+ auto Size = Asm->MAI->getCodePointerSize();
+ bool UseDwarf5 = DD.getDwarfVersion() >= 5;
- unsigned char Size = Asm->MAI->getCodePointerSize();
- for (const auto &List : DebugLocs.getLists()) {
- Asm->OutStreamer->EmitLabel(List.Label);
+ // Emit our symbol so we can find the beginning of the range.
+ Asm->OutStreamer->EmitLabel(Sym);
- const DwarfCompileUnit *CU = List.CU;
- const MCSymbol *Base = CU->getBaseAddress();
- for (const auto &Entry : DebugLocs.getEntries(List)) {
+ // Gather all the ranges that apply to the same section so they can share
+ // a base address entry.
+  MapVector<const MCSection *, std::vector<decltype(&*R.begin())>>
+      SectionRanges;
+
+ for (const auto &Range : R)
+ SectionRanges[&Range.Begin->getSection()].push_back(&Range);
+
+ const MCSymbol *CUBase = CU.getBaseAddress();
+ bool BaseIsSet = false;
+ for (const auto &P : SectionRanges) {
+ auto *Base = CUBase;
+ if (!Base && ShouldUseBaseAddress) {
+ const MCSymbol *Begin = P.second.front()->Begin;
+ const MCSymbol *NewBase = DD.getSectionLabel(&Begin->getSection());
+ if (!UseDwarf5) {
+ Base = NewBase;
+ BaseIsSet = true;
+ Asm->OutStreamer->EmitIntValue(-1, Size);
+ Asm->OutStreamer->AddComment(" base address");
+ Asm->OutStreamer->EmitSymbolValue(Base, Size);
+ } else if (NewBase != Begin || P.second.size() > 1) {
+        // Only use a base address if
+        // * the existing pool address doesn't match (NewBase != Begin), or
+        // * there's more than one entry to share the base address.
+ Base = NewBase;
+ BaseIsSet = true;
+ Asm->OutStreamer->AddComment(StringifyEnum(BaseAddressx));
+ Asm->emitInt8(BaseAddressx);
+ Asm->OutStreamer->AddComment(" base address index");
+ Asm->EmitULEB128(DD.getAddressPool().getIndex(Base));
+ }
+ } else if (BaseIsSet && !UseDwarf5) {
+ BaseIsSet = false;
+ assert(!Base);
+ Asm->OutStreamer->EmitIntValue(-1, Size);
+ Asm->OutStreamer->EmitIntValue(0, Size);
+ }
+
+ for (const auto *RS : P.second) {
+ const MCSymbol *Begin = RS->Begin;
+ const MCSymbol *End = RS->End;
+ assert(Begin && "Range without a begin symbol?");
+ assert(End && "Range without an end symbol?");
if (Base) {
- // Set up the range. This range is relative to the entry point of the
- // compile unit. This is a hard coded 0 for low_pc when we're emitting
- // ranges, or the DW_AT_low_pc on the compile unit otherwise.
- if (IsLocLists) {
- Asm->OutStreamer->AddComment("DW_LLE_offset_pair");
- Asm->OutStreamer->EmitIntValue(dwarf::DW_LLE_offset_pair, 1);
+ if (UseDwarf5) {
+ // Emit offset_pair when we have a base.
+ Asm->OutStreamer->AddComment(StringifyEnum(OffsetPair));
+ Asm->emitInt8(OffsetPair);
Asm->OutStreamer->AddComment(" starting offset");
- Asm->EmitLabelDifferenceAsULEB128(Entry.BeginSym, Base);
+ Asm->EmitLabelDifferenceAsULEB128(Begin, Base);
Asm->OutStreamer->AddComment(" ending offset");
- Asm->EmitLabelDifferenceAsULEB128(Entry.EndSym, Base);
+ Asm->EmitLabelDifferenceAsULEB128(End, Base);
} else {
- Asm->EmitLabelDifference(Entry.BeginSym, Base, Size);
- Asm->EmitLabelDifference(Entry.EndSym, Base, Size);
+ Asm->EmitLabelDifference(Begin, Base, Size);
+ Asm->EmitLabelDifference(End, Base, Size);
}
-
- emitDebugLocEntryLocation(Entry, CU);
- continue;
- }
-
- // We have no base address.
- if (IsLocLists) {
- // TODO: Use DW_LLE_base_addressx + DW_LLE_offset_pair, or
- // DW_LLE_startx_length in case if there is only a single range.
- // That should reduce the size of the debug data emited.
- // For now just use the DW_LLE_startx_length for all cases.
- Asm->OutStreamer->AddComment("DW_LLE_startx_length");
- Asm->emitInt8(dwarf::DW_LLE_startx_length);
- Asm->OutStreamer->AddComment(" start idx");
- Asm->EmitULEB128(AddrPool.getIndex(Entry.BeginSym));
+ } else if (UseDwarf5) {
+ Asm->OutStreamer->AddComment(StringifyEnum(StartxLength));
+ Asm->emitInt8(StartxLength);
+ Asm->OutStreamer->AddComment(" start index");
+ Asm->EmitULEB128(DD.getAddressPool().getIndex(Begin));
Asm->OutStreamer->AddComment(" length");
- Asm->EmitLabelDifferenceAsULEB128(Entry.EndSym, Entry.BeginSym);
+ Asm->EmitLabelDifferenceAsULEB128(End, Begin);
} else {
- Asm->OutStreamer->EmitSymbolValue(Entry.BeginSym, Size);
- Asm->OutStreamer->EmitSymbolValue(Entry.EndSym, Size);
+ Asm->OutStreamer->EmitSymbolValue(Begin, Size);
+ Asm->OutStreamer->EmitSymbolValue(End, Size);
}
-
- emitDebugLocEntryLocation(Entry, CU);
+ EmitPayload(*RS);
}
+ }
- if (IsLocLists) {
- // .debug_loclists section ends with DW_LLE_end_of_list.
- Asm->OutStreamer->AddComment("DW_LLE_end_of_list");
- Asm->OutStreamer->EmitIntValue(dwarf::DW_LLE_end_of_list, 1);
- } else {
- // Terminate the .debug_loc list with two 0 values.
- Asm->OutStreamer->EmitIntValue(0, Size);
- Asm->OutStreamer->EmitIntValue(0, Size);
- }
+ if (UseDwarf5) {
+ Asm->OutStreamer->AddComment(StringifyEnum(EndOfList));
+ Asm->emitInt8(EndOfList);
+ } else {
+ // Terminate the list with two 0 values.
+ Asm->OutStreamer->EmitIntValue(0, Size);
+ Asm->OutStreamer->EmitIntValue(0, Size);
}
+}
+
+static void emitLocList(DwarfDebug &DD, AsmPrinter *Asm,
+                        const DebugLocStream::List &List) {
+ emitRangeList(
+ DD, Asm, List.Label, DD.getDebugLocs().getEntries(List), *List.CU,
+ dwarf::DW_LLE_base_addressx, dwarf::DW_LLE_offset_pair,
+ dwarf::DW_LLE_startx_length, dwarf::DW_LLE_end_of_list,
+ llvm::dwarf::LocListEncodingString,
+ /* ShouldUseBaseAddress */ true,
+ [&](const DebugLocStream::Entry &E) {
+ DD.emitDebugLocEntryLocation(E, List.CU);
+ });
+}
+
+// Emit locations into the .debug_loc/.debug_rnglists section.
+void DwarfDebug::emitDebugLoc() {
+ if (DebugLocs.getLists().empty())
+ return;
+
+ MCSymbol *TableEnd = nullptr;
+ if (getDwarfVersion() >= 5) {
+ Asm->OutStreamer->SwitchSection(
+ Asm->getObjFileLowering().getDwarfLoclistsSection());
+ TableEnd = emitLoclistsTableHeader(Asm, *this);
+ } else {
+ Asm->OutStreamer->SwitchSection(
+ Asm->getObjFileLowering().getDwarfLocSection());
+ }
+
+ for (const auto &List : DebugLocs.getLists())
+ emitLocList(*this, Asm, List);
if (TableEnd)
Asm->OutStreamer->EmitLabel(TableEnd);
@@ -2232,9 +2466,9 @@ void DwarfDebug::emitDebugLocDWO() {
// Ideally/in v5, this could use SectionLabels to reuse existing addresses
// in the address pool to minimize object size/relocations.
Asm->emitInt8(dwarf::DW_LLE_startx_length);
- unsigned idx = AddrPool.getIndex(Entry.BeginSym);
+ unsigned idx = AddrPool.getIndex(Entry.Begin);
Asm->EmitULEB128(idx);
- Asm->EmitLabelDifference(Entry.EndSym, Entry.BeginSym, 4);
+ Asm->EmitLabelDifference(Entry.End, Entry.Begin, 4);
emitDebugLocEntryLocation(Entry, List.CU);
}
@@ -2360,7 +2594,7 @@ void DwarfDebug::emitDebugARanges() {
// 7.20 in the Dwarf specs requires the table to be aligned to a tuple.
unsigned Padding =
- OffsetToAlignment(sizeof(int32_t) + ContentSize, TupleSize);
+ offsetToAlignment(sizeof(int32_t) + ContentSize, Align(TupleSize));
ContentSize += Padding;
ContentSize += (List.size() + 1) * TupleSize;
@@ -2405,93 +2639,13 @@ void DwarfDebug::emitDebugARanges() {
/// Emit a single range list. We handle both DWARF v5 and earlier.
static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm,
const RangeSpanList &List) {
-
- auto DwarfVersion = DD.getDwarfVersion();
- // Emit our symbol so we can find the beginning of the range.
- Asm->OutStreamer->EmitLabel(List.getSym());
- // Gather all the ranges that apply to the same section so they can share
- // a base address entry.
- MapVector<const MCSection *, std::vector<const RangeSpan *>> SectionRanges;
- // Size for our labels.
- auto Size = Asm->MAI->getCodePointerSize();
-
- for (const RangeSpan &Range : List.getRanges())
- SectionRanges[&Range.getStart()->getSection()].push_back(&Range);
-
- const DwarfCompileUnit &CU = List.getCU();
- const MCSymbol *CUBase = CU.getBaseAddress();
- bool BaseIsSet = false;
- for (const auto &P : SectionRanges) {
- // Don't bother with a base address entry if there's only one range in
- // this section in this range list - for example ranges for a CU will
- // usually consist of single regions from each of many sections
- // (-ffunction-sections, or just C++ inline functions) except under LTO
- // or optnone where there may be holes in a single CU's section
- // contributions.
- auto *Base = CUBase;
- if (!Base && (P.second.size() > 1 || DwarfVersion < 5) &&
- (CU.getCUNode()->getRangesBaseAddress() || DwarfVersion >= 5)) {
- BaseIsSet = true;
- // FIXME/use care: This may not be a useful base address if it's not
- // the lowest address/range in this object.
- Base = P.second.front()->getStart();
- if (DwarfVersion >= 5) {
- Base = DD.getSectionLabel(&Base->getSection());
- Asm->OutStreamer->AddComment("DW_RLE_base_addressx");
- Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_base_addressx, 1);
- Asm->OutStreamer->AddComment(" base address index");
- Asm->EmitULEB128(DD.getAddressPool().getIndex(Base));
- } else {
- Asm->OutStreamer->EmitIntValue(-1, Size);
- Asm->OutStreamer->AddComment(" base address");
- Asm->OutStreamer->EmitSymbolValue(Base, Size);
- }
- } else if (BaseIsSet && DwarfVersion < 5) {
- BaseIsSet = false;
- assert(!Base);
- Asm->OutStreamer->EmitIntValue(-1, Size);
- Asm->OutStreamer->EmitIntValue(0, Size);
- }
-
- for (const auto *RS : P.second) {
- const MCSymbol *Begin = RS->getStart();
- const MCSymbol *End = RS->getEnd();
- assert(Begin && "Range without a begin symbol?");
- assert(End && "Range without an end symbol?");
- if (Base) {
- if (DwarfVersion >= 5) {
- // Emit DW_RLE_offset_pair when we have a base.
- Asm->OutStreamer->AddComment("DW_RLE_offset_pair");
- Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_offset_pair, 1);
- Asm->OutStreamer->AddComment(" starting offset");
- Asm->EmitLabelDifferenceAsULEB128(Begin, Base);
- Asm->OutStreamer->AddComment(" ending offset");
- Asm->EmitLabelDifferenceAsULEB128(End, Base);
- } else {
- Asm->EmitLabelDifference(Begin, Base, Size);
- Asm->EmitLabelDifference(End, Base, Size);
- }
- } else if (DwarfVersion >= 5) {
- Asm->OutStreamer->AddComment("DW_RLE_startx_length");
- Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_startx_length, 1);
- Asm->OutStreamer->AddComment(" start index");
- Asm->EmitULEB128(DD.getAddressPool().getIndex(Begin));
- Asm->OutStreamer->AddComment(" length");
- Asm->EmitLabelDifferenceAsULEB128(End, Begin);
- } else {
- Asm->OutStreamer->EmitSymbolValue(Begin, Size);
- Asm->OutStreamer->EmitSymbolValue(End, Size);
- }
- }
- }
- if (DwarfVersion >= 5) {
- Asm->OutStreamer->AddComment("DW_RLE_end_of_list");
- Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_end_of_list, 1);
- } else {
- // Terminate the list with two 0 values.
- Asm->OutStreamer->EmitIntValue(0, Size);
- Asm->OutStreamer->EmitIntValue(0, Size);
- }
+ emitRangeList(DD, Asm, List.getSym(), List.getRanges(), List.getCU(),
+ dwarf::DW_RLE_base_addressx, dwarf::DW_RLE_offset_pair,
+ dwarf::DW_RLE_startx_length, dwarf::DW_RLE_end_of_list,
+ llvm::dwarf::RangeListEncodingString,
+ List.getCU().getCUNode()->getRangesBaseAddress() ||
+ DD.getDwarfVersion() >= 5,
+ [](auto) {});
}
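Both instantiations emit the same overall shape. For DWARF 5 with a shared base address, a range list body comes out roughly as:

    DW_RLE_base_addressx   <ULEB128 address pool index>
    DW_RLE_offset_pair     <begin - base>  <end - base>   (ULEB128 pair)
    ...
    DW_RLE_end_of_list

Pre-v5, the base address is announced as a (-1, symbol) tuple, entries are plain begin/end address values, and the list terminates with two zero values.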
static void emitDebugRangesImpl(DwarfDebug &DD, AsmPrinter *Asm,
@@ -2637,7 +2791,7 @@ void DwarfDebug::initSkeletonUnit(const DwarfUnit &U, DIE &Die,
DwarfCompileUnit &DwarfDebug::constructSkeletonCU(const DwarfCompileUnit &CU) {
- auto OwnedUnit = llvm::make_unique<DwarfCompileUnit>(
+ auto OwnedUnit = std::make_unique<DwarfCompileUnit>(
CU.getUniqueID(), CU.getCUNode(), Asm, this, &SkeletonHolder);
DwarfCompileUnit &NewCU = *OwnedUnit;
NewCU.setSection(Asm->getObjFileLowering().getDwarfInfoSection());
@@ -2737,7 +2891,7 @@ void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
bool TopLevelType = TypeUnitsUnderConstruction.empty();
AddrPool.resetUsedFlag();
- auto OwnedUnit = llvm::make_unique<DwarfTypeUnit>(CU, Asm, this, &InfoHolder,
+ auto OwnedUnit = std::make_unique<DwarfTypeUnit>(CU, Asm, this, &InfoHolder,
getDwoLineTable(CU));
DwarfTypeUnit &NewTU = *OwnedUnit;
DIE &UnitDie = NewTU.getUnitDie();
@@ -2879,10 +3033,6 @@ uint16_t DwarfDebug::getDwarfVersion() const {
return Asm->OutStreamer->getContext().getDwarfVersion();
}
-void DwarfDebug::addSectionLabel(const MCSymbol *Sym) {
- SectionLabels.insert(std::make_pair(&Sym->getSection(), Sym));
-}
-
const MCSymbol *DwarfDebug::getSectionLabel(const MCSection *S) {
return SectionLabels.find(S)->second;
}
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index 3ac474e2bdda..c8c511f67c2a 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -153,7 +153,7 @@ public:
assert(!ValueLoc && "Already initialized?");
assert(!Value.getExpression()->isFragment() && "Fragments not supported.");
- ValueLoc = llvm::make_unique<DbgValueLoc>(Value);
+ ValueLoc = std::make_unique<DbgValueLoc>(Value);
if (auto *E = ValueLoc->getExpression())
if (E->getNumElements())
FrameIndexExprs.push_back({0, E});
@@ -216,7 +216,6 @@ public:
return !FrameIndexExprs.empty();
}
- bool isBlockByrefVariable() const;
const DIType *getType() const;
static bool classof(const DbgEntity *N) {
@@ -254,6 +253,25 @@ public:
}
};
+/// Used for tracking debug info about call site parameters.
+class DbgCallSiteParam {
+private:
+ unsigned Register; ///< Parameter register at the callee entry point.
+ DbgValueLoc Value; ///< Corresponding location for the parameter value at
+ ///< the call site.
+public:
+ DbgCallSiteParam(unsigned Reg, DbgValueLoc Val)
+ : Register(Reg), Value(Val) {
+ assert(Reg && "Parameter register cannot be undef");
+ }
+
+ unsigned getRegister() const { return Register; }
+ DbgValueLoc getValue() const { return Value; }
+};
+
+/// Collection used for storing debug call site parameters.
+using ParamSet = SmallVector<DbgCallSiteParam, 4>;
+
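A minimal construction sketch (Expr and ParamReg are placeholders; the DbgValueLoc constructor is the one collectCallSiteParameters uses for immediates):

    // Record that the value forwarded in ParamReg was the constant 42.
    DbgValueLoc Loc(Expr, INT64_C(42));
    ParamSet Params;
    Params.push_back(DbgCallSiteParam(ParamReg, Loc));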
/// Helper used to pair up a symbol and its DWARF compile unit.
struct SymbolCU {
SymbolCU(DwarfCompileUnit *CU, const MCSymbol *Sym) : Sym(Sym), CU(CU) {}
diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
index 2858afaa1cf1..1c5a244d7c5d 100644
--- a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
@@ -15,6 +15,7 @@
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/Support/ErrorHandling.h"
@@ -97,7 +98,7 @@ void DwarfExpression::addAnd(unsigned Mask) {
bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI,
unsigned MachineReg, unsigned MaxSize) {
- if (!TRI.isPhysicalRegister(MachineReg)) {
+ if (!llvm::Register::isPhysicalRegister(MachineReg)) {
if (isFrameRegister(TRI, MachineReg)) {
DwarfRegs.push_back({-1, 0, nullptr});
return true;
@@ -241,15 +242,22 @@ bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI,
return false;
}
- // Handle simple register locations.
- if (!isMemoryLocation() && !HasComplexExpression) {
+ // Handle simple register locations. If we are supposed to emit
+ // a call site parameter expression and if that expression is just a register
+ // location, emit it with addBReg and offset 0, because we should emit a DWARF
+ // expression representing a value, rather than a location.
+ if (!isMemoryLocation() && !HasComplexExpression &&
+ (!isParameterValue() || isEntryValue())) {
for (auto &Reg : DwarfRegs) {
if (Reg.DwarfRegNo >= 0)
addReg(Reg.DwarfRegNo, Reg.Comment);
addOpPiece(Reg.Size);
}
- if (isEntryValue() && DwarfVersion >= 4)
+ if (isEntryValue())
+ finalizeEntryValue();
+
+ if (isEntryValue() && !isParameterValue() && DwarfVersion >= 4)
emitOp(dwarf::DW_OP_stack_value);
DwarfRegs.clear();
@@ -275,19 +283,27 @@ bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI,
// Pattern-match combinations for which more efficient representations exist.
// [Reg, DW_OP_plus_uconst, Offset] --> [DW_OP_breg, Offset].
if (Op && (Op->getOp() == dwarf::DW_OP_plus_uconst)) {
- SignedOffset = Op->getArg(0);
- ExprCursor.take();
+ uint64_t Offset = Op->getArg(0);
+ uint64_t IntMax = static_cast<uint64_t>(std::numeric_limits<int>::max());
+ if (Offset <= IntMax) {
+ SignedOffset = Offset;
+ ExprCursor.take();
+ }
}
// [Reg, DW_OP_constu, Offset, DW_OP_plus] --> [DW_OP_breg, Offset]
// [Reg, DW_OP_constu, Offset, DW_OP_minus] --> [DW_OP_breg,-Offset]
// If Reg is a subregister we need to mask it out before subtracting.
if (Op && Op->getOp() == dwarf::DW_OP_constu) {
+ uint64_t Offset = Op->getArg(0);
+ uint64_t IntMax = static_cast<uint64_t>(std::numeric_limits<int>::max());
auto N = ExprCursor.peekNext();
- if (N && (N->getOp() == dwarf::DW_OP_plus ||
- (N->getOp() == dwarf::DW_OP_minus && !SubRegisterSizeInBits))) {
- int Offset = Op->getArg(0);
- SignedOffset = (N->getOp() == dwarf::DW_OP_minus) ? -Offset : Offset;
+ if (N && N->getOp() == dwarf::DW_OP_plus && Offset <= IntMax) {
+ SignedOffset = Offset;
+ ExprCursor.consume(2);
+ } else if (N && N->getOp() == dwarf::DW_OP_minus &&
+ !SubRegisterSizeInBits && Offset <= IntMax + 1) {
+ SignedOffset = -static_cast<int64_t>(Offset);
ExprCursor.consume(2);
}
}
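Concretely (illustrative DWARF register numbers):

    [DW_OP_reg6, DW_OP_plus_uconst 16]                -> DW_OP_breg6 +16
    [DW_OP_reg6, DW_OP_constu 16, DW_OP_minus]        -> DW_OP_breg6 -16
    [DW_OP_reg6, DW_OP_constu 0x80000000, DW_OP_plus] -> not folded

The last case is left unfolded because the offset exceeds INT_MAX and the folded form carries a signed offset.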
@@ -300,17 +316,34 @@ bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI,
return true;
}
-void DwarfExpression::addEntryValueExpression(DIExpressionCursor &ExprCursor) {
+void DwarfExpression::beginEntryValueExpression(
+ DIExpressionCursor &ExprCursor) {
auto Op = ExprCursor.take();
- assert(Op && Op->getOp() == dwarf::DW_OP_entry_value);
+ (void)Op;
+ assert(Op && Op->getOp() == dwarf::DW_OP_LLVM_entry_value);
assert(!isMemoryLocation() &&
"We don't support entry values of memory locations yet");
+ assert(!IsEmittingEntryValue && "Already emitting entry value?");
+ assert(Op->getArg(0) == 1 &&
+ "Can currently only emit entry values covering a single operation");
- if (DwarfVersion >= 5)
- emitOp(dwarf::DW_OP_entry_value);
- else
- emitOp(dwarf::DW_OP_GNU_entry_value);
- emitUnsigned(Op->getArg(0));
+ emitOp(CU.getDwarf5OrGNULocationAtom(dwarf::DW_OP_entry_value));
+ IsEmittingEntryValue = true;
+ enableTemporaryBuffer();
+}
+
+void DwarfExpression::finalizeEntryValue() {
+ assert(IsEmittingEntryValue && "Entry value not open?");
+ disableTemporaryBuffer();
+
+ // Emit the entry value's size operand.
+ unsigned Size = getTemporaryBufferSize();
+ emitUnsigned(Size);
+
+ // Emit the entry value's DWARF block operand.
+ commitTemporaryBuffer();
+
+ IsEmittingEntryValue = false;
}
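Taken together with beginEntryValueExpression above, the emission order for, say, DW_OP_entry_value(DW_OP_reg5) is (a sketch of the call sequence, not new API):

    emitOp(DW_OP_entry_value);               // or its GNU analog; main output
    enableTemporaryBuffer();                 // begin buffering the block
    emitOp(DW_OP_reg5);                      // lands in the temporary buffer
    disableTemporaryBuffer();
    emitUnsigned(getTemporaryBufferSize());  // the size operand: 1
    commitTemporaryBuffer();                 // flush DW_OP_reg5 after the size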
/// Assuming a well-formed expression, match "DW_OP_deref* DW_OP_LLVM_fragment?".
@@ -340,7 +373,17 @@ void DwarfExpression::addExpression(DIExpressionCursor &&ExprCursor,
while (ExprCursor) {
auto Op = ExprCursor.take();
- switch (Op->getOp()) {
+ uint64_t OpNum = Op->getOp();
+
+ if (OpNum >= dwarf::DW_OP_reg0 && OpNum <= dwarf::DW_OP_reg31) {
+ emitOp(OpNum);
+ continue;
+ } else if (OpNum >= dwarf::DW_OP_breg0 && OpNum <= dwarf::DW_OP_breg31) {
+ addBReg(OpNum - dwarf::DW_OP_breg0, Op->getArg(0));
+ continue;
+ }
+
+ switch (OpNum) {
case dwarf::DW_OP_LLVM_fragment: {
unsigned SizeInBits = Op->getArg(1);
unsigned FragmentOffset = Op->getArg(0);
@@ -389,10 +432,13 @@ void DwarfExpression::addExpression(DIExpressionCursor &&ExprCursor,
case dwarf::DW_OP_lit0:
case dwarf::DW_OP_not:
case dwarf::DW_OP_dup:
- emitOp(Op->getOp());
+ emitOp(OpNum);
break;
case dwarf::DW_OP_deref:
assert(!isRegisterLocation());
+      // For a more detailed explanation, see llvm.org/PR43343.
+      assert(!isParameterValue() && "Parameter entry values should not be "
+                                    "dereferenced for safety reasons.");
if (!isMemoryLocation() && ::isMemoryLocation(ExprCursor))
// Turning this into a memory location description makes the deref
// implicit.
@@ -458,12 +504,21 @@ void DwarfExpression::addExpression(DIExpressionCursor &&ExprCursor,
case dwarf::DW_OP_LLVM_tag_offset:
TagOffset = Op->getArg(0);
break;
+ case dwarf::DW_OP_regx:
+ emitOp(dwarf::DW_OP_regx);
+ emitUnsigned(Op->getArg(0));
+ break;
+ case dwarf::DW_OP_bregx:
+ emitOp(dwarf::DW_OP_bregx);
+ emitUnsigned(Op->getArg(0));
+ emitSigned(Op->getArg(1));
+ break;
default:
llvm_unreachable("unhandled opcode found in expression");
}
}
- if (isImplicitLocation())
+ if (isImplicitLocation() && !isParameterValue())
// Turn this into an implicit location description.
addStackValue();
}
diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.h b/lib/CodeGen/AsmPrinter/DwarfExpression.h
index ec2ef6e575f7..1ad46669f9b2 100644
--- a/lib/CodeGen/AsmPrinter/DwarfExpression.h
+++ b/lib/CodeGen/AsmPrinter/DwarfExpression.h
@@ -13,6 +13,7 @@
#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFEXPRESSION_H
#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFEXPRESSION_H
+#include "ByteStreamer.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
@@ -26,7 +27,6 @@ namespace llvm {
class AsmPrinter;
class APInt;
-class ByteStreamer;
class DwarfCompileUnit;
class DIELoc;
class TargetRegisterInfo;
@@ -95,6 +95,13 @@ public:
/// Base class containing the logic for constructing DWARF expressions
/// independently of whether they are emitted into a DIE or into a .debug_loc
/// entry.
+///
+/// Some DWARF operations, e.g. DW_OP_entry_value, need to calculate the size
+/// of a succeeding DWARF block before the latter is emitted to the output.
+/// To handle such cases, data can conditionally be emitted to a temporary
+/// buffer, which can later on be committed to the main output. The size of the
+/// temporary buffer is queryable, allowing for the size of the data to be
+/// emitted before the data is committed.
class DwarfExpression {
protected:
/// Holds information about all subregisters comprising a register location.
@@ -104,6 +111,9 @@ protected:
const char *Comment;
};
+ /// Whether we are currently emitting an entry value operation.
+ bool IsEmittingEntryValue = false;
+
DwarfCompileUnit &CU;
/// The register location, if any.
@@ -120,7 +130,7 @@ protected:
enum { Unknown = 0, Register, Memory, Implicit };
/// The flags of location description being produced.
- enum { EntryValue = 1 };
+ enum { EntryValue = 1, CallSiteParamValue };
unsigned LocationKind : 3;
unsigned LocationFlags : 2;
@@ -147,6 +157,10 @@ public:
return LocationFlags & EntryValue;
}
+ bool isParameterValue() {
+ return LocationFlags & CallSiteParamValue;
+ }
+
Optional<uint8_t> TagOffset;
protected:
@@ -174,6 +188,22 @@ protected:
virtual void emitBaseTypeRef(uint64_t Idx) = 0;
+ /// Start emitting data to the temporary buffer. The data stored in the
+ /// temporary buffer can be committed to the main output using
+ /// commitTemporaryBuffer().
+ virtual void enableTemporaryBuffer() = 0;
+
+ /// Disable emission to the temporary buffer. This does not commit data
+ /// in the temporary buffer to the main output.
+ virtual void disableTemporaryBuffer() = 0;
+
+ /// Return the emitted size, in number of bytes, for the data stored in the
+ /// temporary buffer.
+ virtual unsigned getTemporaryBufferSize() = 0;
+
+ /// Commit the data stored in the temporary buffer to the main output.
+ virtual void commitTemporaryBuffer() = 0;
+
/// Emit a normalized unsigned constant.
void emitConstu(uint64_t Value);
@@ -233,6 +263,10 @@ protected:
/// expression. See PR21176 for more details.
void addStackValue();
+ /// Finalize an entry value by emitting its size operand, and committing the
+ /// DWARF block which has been emitted to the temporary buffer.
+ void finalizeEntryValue();
+
~DwarfExpression() = default;
public:
@@ -264,6 +298,11 @@ public:
LocationFlags |= EntryValue;
}
+ /// Lock this down to become a call site parameter location.
+ void setCallSiteParamValueFlag() {
+ LocationFlags |= CallSiteParamValue;
+ }
+
/// Emit a machine register location. As an optimization this may also consume
/// the prefix of a DwarfExpression if a more efficient representation for
/// combining the register location and the first operation exists.
@@ -278,8 +317,11 @@ public:
DIExpressionCursor &Expr, unsigned MachineReg,
unsigned FragmentOffsetInBits = 0);
- /// Emit entry value dwarf operation.
- void addEntryValueExpression(DIExpressionCursor &ExprCursor);
+ /// Begin emission of an entry value dwarf operation. The entry value's
+ /// first operand is the size of the DWARF block (its second operand),
+ /// which needs to be calculated at time of emission, so we don't emit
+ /// any operands here.
+ void beginEntryValueExpression(DIExpressionCursor &ExprCursor);
/// Emit all remaining operations in the DIExpressionCursor.
///
@@ -299,31 +341,62 @@ public:
/// DwarfExpression implementation for .debug_loc entries.
class DebugLocDwarfExpression final : public DwarfExpression {
- ByteStreamer &BS;
+
+ struct TempBuffer {
+ SmallString<32> Bytes;
+ std::vector<std::string> Comments;
+ BufferByteStreamer BS;
+
+ TempBuffer(bool GenerateComments) : BS(Bytes, Comments, GenerateComments) {}
+ };
+
+ std::unique_ptr<TempBuffer> TmpBuf;
+ BufferByteStreamer &OutBS;
+ bool IsBuffering = false;
+
+ /// Return the byte streamer that currently is being emitted to.
+ ByteStreamer &getActiveStreamer() { return IsBuffering ? TmpBuf->BS : OutBS; }
void emitOp(uint8_t Op, const char *Comment = nullptr) override;
void emitSigned(int64_t Value) override;
void emitUnsigned(uint64_t Value) override;
void emitData1(uint8_t Value) override;
void emitBaseTypeRef(uint64_t Idx) override;
+
+ void enableTemporaryBuffer() override;
+ void disableTemporaryBuffer() override;
+ unsigned getTemporaryBufferSize() override;
+ void commitTemporaryBuffer() override;
+
bool isFrameRegister(const TargetRegisterInfo &TRI,
unsigned MachineReg) override;
-
public:
- DebugLocDwarfExpression(unsigned DwarfVersion, ByteStreamer &BS, DwarfCompileUnit &CU)
- : DwarfExpression(DwarfVersion, CU), BS(BS) {}
+ DebugLocDwarfExpression(unsigned DwarfVersion, BufferByteStreamer &BS,
+ DwarfCompileUnit &CU)
+ : DwarfExpression(DwarfVersion, CU), OutBS(BS) {}
};
/// DwarfExpression implementation for singular DW_AT_location.
class DIEDwarfExpression final : public DwarfExpression {
-const AsmPrinter &AP;
- DIELoc &DIE;
+ const AsmPrinter &AP;
+ DIELoc &OutDIE;
+ DIELoc TmpDIE;
+ bool IsBuffering = false;
+
+ /// Return the DIE that currently is being emitted to.
+ DIELoc &getActiveDIE() { return IsBuffering ? TmpDIE : OutDIE; }
void emitOp(uint8_t Op, const char *Comment = nullptr) override;
void emitSigned(int64_t Value) override;
void emitUnsigned(uint64_t Value) override;
void emitData1(uint8_t Value) override;
void emitBaseTypeRef(uint64_t Idx) override;
+
+ void enableTemporaryBuffer() override;
+ void disableTemporaryBuffer() override;
+ unsigned getTemporaryBufferSize() override;
+ void commitTemporaryBuffer() override;
+
bool isFrameRegister(const TargetRegisterInfo &TRI,
unsigned MachineReg) override;
public:
@@ -331,7 +404,7 @@ public:
DIELoc *finalize() {
DwarfExpression::finalize();
- return &DIE;
+ return &OutDIE;
}
};
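The header above only declares the buffering hooks; the matching DwarfExpression.cpp change is not part of this hunk. Given the interface, a plausible sketch of finalizeEntryValue() — the method that consumes the buffer — could look like this (only the members declared above are assumed):

void DwarfExpression::finalizeEntryValue() {
  assert(IsEmittingEntryValue && "Entry value not open?");
  disableTemporaryBuffer();
  // DW_OP_entry_value's first operand is the length of the block that
  // follows; it is only known once the block has been buffered.
  emitUnsigned(getTemporaryBufferSize());
  // Now append the buffered DWARF block itself as the second operand.
  commitTemporaryBuffer();
  IsEmittingEntryValue = false;
}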
diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h
index 244678ce9dc1..35fa51fb24c4 100644
--- a/lib/CodeGen/AsmPrinter/DwarfFile.h
+++ b/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -32,15 +32,9 @@ class LexicalScope;
class MCSection;
// Data structure to hold a range for range lists.
-class RangeSpan {
-public:
- RangeSpan(MCSymbol *S, MCSymbol *E) : Start(S), End(E) {}
- const MCSymbol *getStart() const { return Start; }
- const MCSymbol *getEnd() const { return End; }
- void setEnd(const MCSymbol *E) { End = E; }
-
-private:
- const MCSymbol *Start, *End;
+struct RangeSpan {
+ const MCSymbol *Begin;
+ const MCSymbol *End;
};
class RangeSpanList {
@@ -86,10 +80,6 @@ class DwarfFile {
/// The table is shared by all units.
MCSymbol *RnglistsTableBaseSym = nullptr;
- /// DWARF v5: The symbol that designates the base of the locations list table.
- /// The table is shared by all units.
- MCSymbol *LoclistsTableBaseSym = nullptr;
-
/// The variables of a lexical scope.
struct ScopeVars {
/// We need to sort Args by ArgNo and check for duplicates. This could also
@@ -167,9 +157,6 @@ public:
MCSymbol *getRnglistsTableBaseSym() const { return RnglistsTableBaseSym; }
void setRnglistsTableBaseSym(MCSymbol *Sym) { RnglistsTableBaseSym = Sym; }
- MCSymbol *getLoclistsTableBaseSym() const { return LoclistsTableBaseSym; }
- void setLoclistsTableBaseSym(MCSymbol *Sym) { LoclistsTableBaseSym = Sym; }
-
/// \returns false if the variable was merged with a previous one.
bool addScopeVariable(LexicalScope *LS, DbgVariable *Var);
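Turning RangeSpan into a plain aggregate lets call sites brace-initialize it rather than go through the removed constructor and accessors. A minimal sketch (the symbol arguments are assumed inputs, not names from the patch):

RangeSpan makeSpan(const MCSymbol *Begin, const MCSymbol *End) {
  return {Begin, End};
}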
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 991ab94b50ab..37c68c085792 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -47,31 +47,42 @@ using namespace llvm;
#define DEBUG_TYPE "dwarfdebug"
DIEDwarfExpression::DIEDwarfExpression(const AsmPrinter &AP,
- DwarfCompileUnit &CU,
- DIELoc &DIE)
- : DwarfExpression(AP.getDwarfVersion(), CU), AP(AP),
- DIE(DIE) {}
+ DwarfCompileUnit &CU, DIELoc &DIE)
+ : DwarfExpression(AP.getDwarfVersion(), CU), AP(AP), OutDIE(DIE) {}
void DIEDwarfExpression::emitOp(uint8_t Op, const char* Comment) {
- CU.addUInt(DIE, dwarf::DW_FORM_data1, Op);
+ CU.addUInt(getActiveDIE(), dwarf::DW_FORM_data1, Op);
}
void DIEDwarfExpression::emitSigned(int64_t Value) {
- CU.addSInt(DIE, dwarf::DW_FORM_sdata, Value);
+ CU.addSInt(getActiveDIE(), dwarf::DW_FORM_sdata, Value);
}
void DIEDwarfExpression::emitUnsigned(uint64_t Value) {
- CU.addUInt(DIE, dwarf::DW_FORM_udata, Value);
+ CU.addUInt(getActiveDIE(), dwarf::DW_FORM_udata, Value);
}
void DIEDwarfExpression::emitData1(uint8_t Value) {
- CU.addUInt(DIE, dwarf::DW_FORM_data1, Value);
+ CU.addUInt(getActiveDIE(), dwarf::DW_FORM_data1, Value);
}
void DIEDwarfExpression::emitBaseTypeRef(uint64_t Idx) {
- CU.addBaseTypeRef(DIE, Idx);
+ CU.addBaseTypeRef(getActiveDIE(), Idx);
}
+void DIEDwarfExpression::enableTemporaryBuffer() {
+ assert(!IsBuffering && "Already buffering?");
+ IsBuffering = true;
+}
+
+void DIEDwarfExpression::disableTemporaryBuffer() { IsBuffering = false; }
+
+unsigned DIEDwarfExpression::getTemporaryBufferSize() {
+ return TmpDIE.ComputeSize(&AP);
+}
+
+void DIEDwarfExpression::commitTemporaryBuffer() { OutDIE.takeValues(TmpDIE); }
+
bool DIEDwarfExpression::isFrameRegister(const TargetRegisterInfo &TRI,
unsigned MachineReg) {
return MachineReg == TRI.getFrameRegister(*AP.MF);
@@ -205,6 +216,10 @@ void DwarfUnit::insertDIE(const DINode *Desc, DIE *D) {
MDNodeToDieMap.insert(std::make_pair(Desc, D));
}
+void DwarfUnit::insertDIE(DIE *D) {
+ MDNodeToDieMap.insert(std::make_pair(nullptr, D));
+}
+
void DwarfUnit::addFlag(DIE &Die, dwarf::Attribute Attribute) {
if (DD->getDwarfVersion() >= 4)
Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_flag_present,
@@ -718,7 +733,7 @@ std::string DwarfUnit::getParentContextString(const DIScope *Context) const {
return "";
// FIXME: Decide whether to implement this for non-C++ languages.
- if (getLanguage() != dwarf::DW_LANG_C_plus_plus)
+ if (!dwarf::isCPlusPlus((dwarf::SourceLanguage)getLanguage()))
return "";
std::string CS;
@@ -942,6 +957,9 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
if (CTy->isAppleBlockExtension())
addFlag(Buffer, dwarf::DW_AT_APPLE_block);
+ if (CTy->getExportSymbols())
+ addFlag(Buffer, dwarf::DW_AT_export_symbols);
+
// This is outside the DWARF spec, but GDB expects a DW_AT_containing_type
// inside C++ composite types to point to the base class with the vtable.
// Rust uses DW_AT_containing_type to link a vtable to the type
@@ -1696,15 +1714,6 @@ void DwarfUnit::addRnglistsBase() {
TLOF.getDwarfRnglistsSection()->getBeginSymbol());
}
-void DwarfUnit::addLoclistsBase() {
- assert(DD->getDwarfVersion() >= 5 &&
- "DW_AT_loclists_base requires DWARF version 5 or later");
- const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
- addSectionLabel(getUnitDie(), dwarf::DW_AT_loclists_base,
- DU->getLoclistsTableBaseSym(),
- TLOF.getDwarfLoclistsSection()->getBeginSymbol());
-}
-
void DwarfTypeUnit::finishNonUnitTypeDIE(DIE& D, const DICompositeType *CTy) {
addFlag(D, dwarf::DW_AT_declaration);
StringRef Name = CTy->getName();
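The getParentContextString() change above broadens the language test from the single DW_LANG_C_plus_plus code to every C++ dialect code. dwarf::isCPlusPlus behaves roughly like this sketch:

static bool isCPlusPlusLike(dwarf::SourceLanguage S) {
  switch (S) {
  case dwarf::DW_LANG_C_plus_plus:
  case dwarf::DW_LANG_C_plus_plus_03:
  case dwarf::DW_LANG_C_plus_plus_11:
  case dwarf::DW_LANG_C_plus_plus_14:
    return true;
  default:
    return false;
  }
}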
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h
index 56c934a35ae8..46c52a1faf4b 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -127,6 +127,8 @@ public:
/// the mappings are kept in DwarfDebug.
void insertDIE(const DINode *Desc, DIE *D);
+ void insertDIE(DIE *D);
+
/// Add a flag that is true to the DIE.
void addFlag(DIE &Die, dwarf::Attribute Attribute);
@@ -214,15 +216,6 @@ public:
/// Add thrown types.
void addThrownTypes(DIE &Die, DINodeArray ThrownTypes);
- // FIXME: Should be reformulated in terms of addComplexAddress.
- /// Start with the address based on the location provided, and generate the
- /// DWARF information necessary to find the actual Block variable (navigating
- /// the Block struct) based on the starting location. Add the DWARF
- /// information to the die. Obsolete, please use addComplexAddress instead.
- void addBlockByrefAddress(const DbgVariable &DV, DIE &Die,
- dwarf::Attribute Attribute,
- const MachineLocation &Location);
-
/// Add a new type attribute to the specified entity.
///
/// This takes and attribute parameter because DW_AT_friend attributes are
@@ -279,9 +272,6 @@ public:
/// Add the DW_AT_rnglists_base attribute to the unit DIE.
void addRnglistsBase();
- /// Add the DW_AT_loclists_base attribute to the unit DIE.
- void addLoclistsBase();
-
virtual DwarfCompileUnit &getCU() = 0;
void constructTypeDIE(DIE &Buffer, const DICompositeType *CTy);
diff --git a/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/lib/CodeGen/AsmPrinter/EHStreamer.cpp
index 99e3687b36b8..31dfaaac836e 100644
--- a/lib/CodeGen/AsmPrinter/EHStreamer.cpp
+++ b/lib/CodeGen/AsmPrinter/EHStreamer.cpp
@@ -426,7 +426,7 @@ MCSymbol *EHStreamer::emitExceptionTable() {
// EHABI). In this case LSDASection will be NULL.
if (LSDASection)
Asm->OutStreamer->SwitchSection(LSDASection);
- Asm->EmitAlignment(2);
+ Asm->EmitAlignment(Align(4));
// Emit the LSDA.
MCSymbol *GCCETSym =
@@ -602,11 +602,11 @@ MCSymbol *EHStreamer::emitExceptionTable() {
}
if (HaveTTData) {
- Asm->EmitAlignment(2);
+ Asm->EmitAlignment(Align(4));
emitTypeInfos(TTypeEncoding, TTBaseLabel);
}
- Asm->EmitAlignment(2);
+ Asm->EmitAlignment(Align(4));
return GCCETSym;
}
diff --git a/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp b/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
index 39392b79e960..3849644d1584 100644
--- a/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
@@ -72,7 +72,7 @@ void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
**/
// Align to address width.
- AP.EmitAlignment(IntPtrSize == 4 ? 2 : 3);
+ AP.EmitAlignment(IntPtrSize == 4 ? Align(4) : Align(8));
// Emit PointCount.
OS.AddComment("safe point count");
diff --git a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
index 3145cc90dc73..b4eda5fa8c58 100644
--- a/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
@@ -129,7 +129,7 @@ void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
report_fatal_error(" Too much descriptor for ocaml GC");
}
AP.emitInt16(NumDescriptors);
- AP.EmitAlignment(IntPtrSize == 4 ? 2 : 3);
+ AP.EmitAlignment(IntPtrSize == 4 ? Align(4) : Align(8));
for (GCModuleInfo::FuncInfoVec::iterator I = Info.funcinfo_begin(),
IE = Info.funcinfo_end();
@@ -180,7 +180,7 @@ void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
AP.emitInt16(K->StackOffset);
}
- AP.EmitAlignment(IntPtrSize == 4 ? 2 : 3);
+ AP.EmitAlignment(IntPtrSize == 4 ? Align(4) : Align(8));
}
}
}
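The EmitAlignment changes in the three files above are a unit conversion, not a behavior change: the old parameter was log2(bytes), while the new Align is stated directly in bytes. A quick self-contained check of that mapping:

#include "llvm/Support/Alignment.h"
#include <cassert>

int main() {
  // Old EmitAlignment(2) == new EmitAlignment(Align(4)); likewise 3 -> 8.
  assert(llvm::Align(4).value() == (1u << 2));
  assert(llvm::Align(8).value() == (1u << 3));
  return 0;
}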
diff --git a/lib/CodeGen/AsmPrinter/WinException.cpp b/lib/CodeGen/AsmPrinter/WinException.cpp
index 155e91ce61a1..0398675577cd 100644
--- a/lib/CodeGen/AsmPrinter/WinException.cpp
+++ b/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -982,8 +982,7 @@ void WinException::emitExceptHandlerTable(const MachineFunction *MF) {
OS.EmitValueToAlignment(4);
OS.EmitLabel(LSDALabel);
- const Function *Per =
- dyn_cast<Function>(F.getPersonalityFn()->stripPointerCasts());
+ const auto *Per = cast<Function>(F.getPersonalityFn()->stripPointerCasts());
StringRef PerName = Per->getName();
int BaseState = -1;
if (PerName == "_except_handler4") {
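Replacing dyn_cast with cast here encodes an invariant: a personality function operand, after stripping pointer casts, must be a Function, so the null check was dead code. The same dyn_cast-to-cast tightening recurs in CodeGenPrepare below. As a general sketch of the distinction:

#include "llvm/Support/Casting.h"

// dyn_cast<T> returns nullptr on a type mismatch and forces a null check;
// cast<T> asserts isa<T> and never returns nullptr.
template <typename T, typename V> T *mustBeA(V *Val) {
  return llvm::cast<T>(Val);
}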
diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp
index dc7eaf6a5fe7..27b298dcf6af 100644
--- a/lib/CodeGen/AtomicExpandPass.cpp
+++ b/lib/CodeGen/AtomicExpandPass.cpp
@@ -382,7 +382,7 @@ LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
Value *NewAddr = Builder.CreateBitCast(Addr, PT);
auto *NewLI = Builder.CreateLoad(NewTy, NewAddr);
- NewLI->setAlignment(LI->getAlignment());
+ NewLI->setAlignment(MaybeAlign(LI->getAlignment()));
NewLI->setVolatile(LI->isVolatile());
NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
LLVM_DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");
@@ -469,7 +469,7 @@ StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
Value *NewAddr = Builder.CreateBitCast(Addr, PT);
StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr);
- NewSI->setAlignment(SI->getAlignment());
+ NewSI->setAlignment(MaybeAlign(SI->getAlignment()));
NewSI->setVolatile(SI->isVolatile());
NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID());
LLVM_DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
@@ -1376,7 +1376,7 @@ Value *AtomicExpand::insertRMWCmpXchgLoop(
Builder.SetInsertPoint(BB);
LoadInst *InitLoaded = Builder.CreateLoad(ResultTy, Addr);
// Atomics require at least natural alignment.
- InitLoaded->setAlignment(ResultTy->getPrimitiveSizeInBits() / 8);
+ InitLoaded->setAlignment(MaybeAlign(ResultTy->getPrimitiveSizeInBits() / 8));
Builder.CreateBr(LoopBB);
// Start the main loop block now that we've taken care of the preliminaries.
@@ -1711,7 +1711,7 @@ bool AtomicExpand::expandAtomicOpToLibcall(
// 'expected' argument, if present.
if (CASExpected) {
AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType());
- AllocaCASExpected->setAlignment(AllocaAlignment);
+ AllocaCASExpected->setAlignment(MaybeAlign(AllocaAlignment));
unsigned AllocaAS = AllocaCASExpected->getType()->getPointerAddressSpace();
AllocaCASExpected_i8 =
@@ -1730,7 +1730,7 @@ bool AtomicExpand::expandAtomicOpToLibcall(
Args.push_back(IntValue);
} else {
AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType());
- AllocaValue->setAlignment(AllocaAlignment);
+ AllocaValue->setAlignment(MaybeAlign(AllocaAlignment));
AllocaValue_i8 =
Builder.CreateBitCast(AllocaValue, Type::getInt8PtrTy(Ctx));
Builder.CreateLifetimeStart(AllocaValue_i8, SizeVal64);
@@ -1742,7 +1742,7 @@ bool AtomicExpand::expandAtomicOpToLibcall(
// 'ret' argument.
if (!CASExpected && HasResult && !UseSizedLibcall) {
AllocaResult = AllocaBuilder.CreateAlloca(I->getType());
- AllocaResult->setAlignment(AllocaAlignment);
+ AllocaResult->setAlignment(MaybeAlign(AllocaAlignment));
unsigned AllocaAS = AllocaResult->getType()->getPointerAddressSpace();
AllocaResult_i8 =
Builder.CreateBitCast(AllocaResult, Type::getInt8PtrTy(Ctx, AllocaAS));
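The setAlignment call sites above all wrap the legacy unsigned value in MaybeAlign, which preserves the old convention that zero means "no alignment specified". A minimal sketch of the adaptation:

#include "llvm/Support/Alignment.h"

llvm::MaybeAlign fromLegacyAlignment(unsigned A) {
  // MaybeAlign(0) is None; any nonzero power of two wraps an Align.
  return llvm::MaybeAlign(A);
}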
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index fb54b5d6c8d8..455916eeb82f 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -129,9 +129,10 @@ bool BranchFolderPass::runOnMachineFunction(MachineFunction &MF) {
getAnalysis<MachineBlockFrequencyInfo>());
BranchFolder Folder(EnableTailMerge, /*CommonHoist=*/true, MBBFreqInfo,
getAnalysis<MachineBranchProbabilityInfo>());
- return Folder.OptimizeFunction(MF, MF.getSubtarget().getInstrInfo(),
- MF.getSubtarget().getRegisterInfo(),
- getAnalysisIfAvailable<MachineModuleInfo>());
+ auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>();
+ return Folder.OptimizeFunction(
+ MF, MF.getSubtarget().getInstrInfo(), MF.getSubtarget().getRegisterInfo(),
+ MMIWP ? &MMIWP->getMMI() : nullptr);
}
BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist,
@@ -161,6 +162,11 @@ void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) {
// Avoid matching if this pointer gets reused.
TriedMerging.erase(MBB);
+ // Update call site info.
+ std::for_each(MBB->begin(), MBB->end(), [MF](const MachineInstr &MI) {
+ if (MI.isCall(MachineInstr::IgnoreBundle))
+ MF->eraseCallSiteInfo(&MI);
+ });
// Remove the block.
MF->erase(MBB);
EHScopeMembership.erase(MBB);
@@ -1306,6 +1312,8 @@ static bool IsBranchOnlyBlock(MachineBasicBlock *MBB) {
/// result in infinite loops.
static bool IsBetterFallthrough(MachineBasicBlock *MBB1,
MachineBasicBlock *MBB2) {
+ assert(MBB1 && MBB2 && "Unknown MachineBasicBlock");
+
// Right now, we use a simple heuristic. If MBB2 ends with a call, and
// MBB1 doesn't, we prefer to fall through into MBB1. This allows us to
// optimize branches that branch to either a return block or an assert block
@@ -1843,7 +1851,7 @@ static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB,
template <class Container>
static void addRegAndItsAliases(unsigned Reg, const TargetRegisterInfo *TRI,
Container &Set) {
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (Register::isPhysicalRegister(Reg)) {
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
Set.insert(*AI);
} else {
@@ -1871,7 +1879,7 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
for (const MachineOperand &MO : Loc->operands()) {
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
if (MO.isUse()) {
@@ -1909,7 +1917,7 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
return Loc;
if (!MO.isReg() || MO.isUse())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
if (Uses.count(Reg)) {
@@ -1937,14 +1945,14 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
for (const MachineOperand &MO : PI->operands()) {
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
if (MO.isUse()) {
addRegAndItsAliases(Reg, TRI, Uses);
} else {
if (Uses.erase(Reg)) {
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (Register::isPhysicalRegister(Reg)) {
for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
Uses.erase(*SubRegs); // Use sub-registers to be conservative
}
@@ -2010,7 +2018,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
}
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
if (MO.isDef()) {
@@ -2060,13 +2068,13 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
for (const MachineOperand &MO : TIB->operands()) {
if (!MO.isReg() || !MO.isUse() || !MO.isKill())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
if (!AllDefsSet.count(Reg)) {
continue;
}
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (Register::isPhysicalRegister(Reg)) {
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
ActiveDefsSet.erase(*AI);
} else {
@@ -2078,8 +2086,8 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
for (const MachineOperand &MO : TIB->operands()) {
if (!MO.isReg() || !MO.isDef() || MO.isDead())
continue;
- unsigned Reg = MO.getReg();
- if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg))
+ Register Reg = MO.getReg();
+ if (!Reg || Register::isVirtualRegister(Reg))
continue;
addRegAndItsAliases(Reg, TRI, ActiveDefsSet);
addRegAndItsAliases(Reg, TRI, AllDefsSet);
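The recurring unsigned-to-Register changes in this file (and in several files below) move the virtual/physical predicates from TargetRegisterInfo statics onto llvm::Register without changing the numeric encoding. Minimal sketch:

#include "llvm/CodeGen/Register.h"

bool isPhysical(llvm::Register R) {
  return llvm::Register::isPhysicalRegister(R);
}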
diff --git a/lib/CodeGen/BranchRelaxation.cpp b/lib/CodeGen/BranchRelaxation.cpp
index 3ad6266d4f35..6efdc9efa968 100644
--- a/lib/CodeGen/BranchRelaxation.cpp
+++ b/lib/CodeGen/BranchRelaxation.cpp
@@ -64,19 +64,18 @@ class BranchRelaxation : public MachineFunctionPass {
/// Compute the offset immediately following this block. \p MBB is the next
/// block.
unsigned postOffset(const MachineBasicBlock &MBB) const {
- unsigned PO = Offset + Size;
- unsigned Align = MBB.getAlignment();
- if (Align == 0)
+ const unsigned PO = Offset + Size;
+ const Align Alignment = MBB.getAlignment();
+ if (Alignment == 1)
return PO;
- unsigned AlignAmt = 1 << Align;
- unsigned ParentAlign = MBB.getParent()->getAlignment();
- if (Align <= ParentAlign)
- return PO + OffsetToAlignment(PO, AlignAmt);
+ const Align ParentAlign = MBB.getParent()->getAlignment();
+ if (Alignment <= ParentAlign)
+ return PO + offsetToAlignment(PO, Alignment);
// The alignment of this MBB is larger than the function's alignment, so we
// can't tell whether or not it will insert nops. Assume that it will.
- return PO + AlignAmt + OffsetToAlignment(PO, AlignAmt);
+ return PO + Alignment.value() + offsetToAlignment(PO, Alignment);
}
};
@@ -128,9 +127,8 @@ void BranchRelaxation::verify() {
#ifndef NDEBUG
unsigned PrevNum = MF->begin()->getNumber();
for (MachineBasicBlock &MBB : *MF) {
- unsigned Align = MBB.getAlignment();
- unsigned Num = MBB.getNumber();
- assert(BlockInfo[Num].Offset % (1u << Align) == 0);
+ const unsigned Num = MBB.getNumber();
+ assert(isAligned(MBB.getAlignment(), BlockInfo[Num].Offset));
assert(!Num || BlockInfo[PrevNum].postOffset(MBB) <= BlockInfo[Num].Offset);
assert(BlockInfo[Num].Size == computeBlockSize(MBB));
PrevNum = Num;
@@ -143,7 +141,7 @@ void BranchRelaxation::verify() {
LLVM_DUMP_METHOD void BranchRelaxation::dumpBBs() {
for (auto &MBB : *MF) {
const BasicBlockInfo &BBI = BlockInfo[MBB.getNumber()];
- dbgs() << format("%bb.%u\toffset=%08x\t", MBB.getNumber(), BBI.Offset)
+ dbgs() << format("%%bb.%u\toffset=%08x\t", MBB.getNumber(), BBI.Offset)
<< format("size=%#x\n", BBI.Size);
}
}
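A worked example of the rewritten padding math in postOffset(), runnable against the new Alignment.h (the offset value is illustrative):

#include "llvm/Support/Alignment.h"
#include <cassert>

int main() {
  const unsigned PO = 10; // offset immediately after the previous block
  // 2 bytes of padding reach the next 4-byte boundary: 10 -> 12.
  assert(llvm::offsetToAlignment(PO, llvm::Align(4)) == 2);
  return 0;
}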
diff --git a/lib/CodeGen/BreakFalseDeps.cpp b/lib/CodeGen/BreakFalseDeps.cpp
index cc4b2caa9bed..709164e5f178 100644
--- a/lib/CodeGen/BreakFalseDeps.cpp
+++ b/lib/CodeGen/BreakFalseDeps.cpp
@@ -9,12 +9,11 @@
/// \file Break False Dependency pass.
///
/// Some instructions have false dependencies which cause unnecessary stalls.
-/// For exmaple, instructions that only write part of a register, and implicitly
-/// need to read the other parts of the register. This may cause unwanted
+/// For example, instructions may write part of a register and implicitly
+/// need to read the other parts of the register. This may cause unwanted
/// stalls preventing otherwise unrelated instructions from executing in
/// parallel in an out-of-order CPU.
-/// This pass is aimed at identifying and avoiding these depepndencies when
-/// possible.
+/// This pass is aimed at identifying and avoiding these dependencies.
//
//===----------------------------------------------------------------------===//
@@ -24,6 +23,7 @@
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
using namespace llvm;
@@ -109,7 +109,7 @@ bool BreakFalseDeps::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
MachineOperand &MO = MI->getOperand(OpIdx);
assert(MO.isUndef() && "Expected undef machine operand");
- unsigned OriginalReg = MO.getReg();
+ Register OriginalReg = MO.getReg();
// Update only undef operands that have reg units that are mapped to one root.
for (MCRegUnitIterator Unit(OriginalReg, TRI); Unit.isValid(); ++Unit) {
@@ -162,7 +162,7 @@ bool BreakFalseDeps::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
bool BreakFalseDeps::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
unsigned Pref) {
- unsigned reg = MI->getOperand(OpIdx).getReg();
+ Register reg = MI->getOperand(OpIdx).getReg();
unsigned Clearance = RDA->getClearance(MI, reg);
LLVM_DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref);
@@ -178,6 +178,7 @@ void BreakFalseDeps::processDefs(MachineInstr *MI) {
assert(!MI->isDebugInstr() && "Won't process debug values");
// Break dependence on undef uses. Do this before updating LiveRegs below.
+ // This can remove a false dependence with no additional instructions.
unsigned OpNum;
unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI);
if (Pref) {
@@ -189,6 +190,11 @@ void BreakFalseDeps::processDefs(MachineInstr *MI) {
UndefReads.push_back(std::make_pair(MI, OpNum));
}
+ // The code below allows the target to create a new instruction to break the
+ // dependence. That opposes the goal of minimizing size, so bail out now.
+ if (MF->getFunction().hasMinSize())
+ return;
+
const MCInstrDesc &MCID = MI->getDesc();
for (unsigned i = 0,
e = MI->isVariadic() ? MI->getNumOperands() : MCID.getNumDefs();
@@ -209,6 +215,11 @@ void BreakFalseDeps::processUndefReads(MachineBasicBlock *MBB) {
if (UndefReads.empty())
return;
+ // The code below allows the target to create a new instruction to break the
+ // dependence. That opposes the goal of minimizing size, so bail out now.
+ if (MF->getFunction().hasMinSize())
+ return;
+
// Collect this block's live out register units.
LiveRegSet.init(*TRI);
// We do not need to care about pristine registers as they are just preserved
diff --git a/lib/CodeGen/CalcSpillWeights.cpp b/lib/CodeGen/CalcSpillWeights.cpp
index 7164fdfb7886..bf97aaee3665 100644
--- a/lib/CodeGen/CalcSpillWeights.cpp
+++ b/lib/CodeGen/CalcSpillWeights.cpp
@@ -40,7 +40,7 @@ void llvm::calculateSpillWeightsAndHints(LiveIntervals &LIS,
MachineRegisterInfo &MRI = MF.getRegInfo();
VirtRegAuxInfo VRAI(MF, LIS, VRM, MLI, MBFI, norm);
for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ unsigned Reg = Register::index2VirtReg(i);
if (MRI.reg_nodbg_empty(Reg))
continue;
VRAI.calculateSpillWeightAndHint(LIS.getInterval(Reg));
@@ -48,10 +48,11 @@ void llvm::calculateSpillWeightsAndHints(LiveIntervals &LIS,
}
// Return the preferred allocation register for reg, given a COPY instruction.
-static unsigned copyHint(const MachineInstr *mi, unsigned reg,
+static Register copyHint(const MachineInstr *mi, unsigned reg,
const TargetRegisterInfo &tri,
const MachineRegisterInfo &mri) {
- unsigned sub, hreg, hsub;
+ unsigned sub, hsub;
+ Register hreg;
if (mi->getOperand(0).getReg() == reg) {
sub = mi->getOperand(0).getSubReg();
hreg = mi->getOperand(1).getReg();
@@ -65,11 +66,11 @@ static unsigned copyHint(const MachineInstr *mi, unsigned reg,
if (!hreg)
return 0;
- if (TargetRegisterInfo::isVirtualRegister(hreg))
- return sub == hsub ? hreg : 0;
+ if (Register::isVirtualRegister(hreg))
+ return sub == hsub ? hreg : Register();
const TargetRegisterClass *rc = mri.getRegClass(reg);
- unsigned CopiedPReg = (hsub ? tri.getSubReg(hreg, hsub) : hreg);
+ Register CopiedPReg = (hsub ? tri.getSubReg(hreg, hsub) : hreg);
if (rc->contains(CopiedPReg))
return CopiedPReg;
@@ -112,7 +113,7 @@ static bool isRematerializable(const LiveInterval &LI,
// If the original (pre-splitting) registers match this
// copy came from a split.
- if (!TargetRegisterInfo::isVirtualRegister(Reg) ||
+ if (!Register::isVirtualRegister(Reg) ||
VRM->getOriginal(Reg) != Original)
return false;
@@ -243,7 +244,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &li, SlotIndex *start,
// Get allocation hints from copies.
if (!mi->isCopy())
continue;
- unsigned hint = copyHint(mi, li.reg, tri, mri);
+ Register hint = copyHint(mi, li.reg, tri, mri);
if (!hint)
continue;
// Force hweight onto the stack so that x86 doesn't add hidden precision,
@@ -251,8 +252,9 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &li, SlotIndex *start,
//
// FIXME: we probably shouldn't use floats at all.
volatile float hweight = Hint[hint] += weight;
- if (TargetRegisterInfo::isVirtualRegister(hint) || mri.isAllocatable(hint))
- CopyHints.insert(CopyHint(hint, hweight, tri.isPhysicalRegister(hint)));
+ if (Register::isVirtualRegister(hint) || mri.isAllocatable(hint))
+ CopyHints.insert(
+ CopyHint(hint, hweight, Register::isPhysicalRegister(hint)));
}
Hint.clear();
diff --git a/lib/CodeGen/CallingConvLower.cpp b/lib/CodeGen/CallingConvLower.cpp
index 497fcb147849..a397039180a4 100644
--- a/lib/CodeGen/CallingConvLower.cpp
+++ b/lib/CodeGen/CallingConvLower.cpp
@@ -32,7 +32,6 @@ CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf,
TRI(*MF.getSubtarget().getRegisterInfo()), Locs(locs), Context(C) {
// No stack is used.
StackOffset = 0;
- MaxStackArgAlign = 1;
clearByValRegsInfo();
UsedRegs.resize((TRI.getNumRegs()+31)/32);
@@ -41,20 +40,21 @@ CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf,
/// Allocate space on the stack large enough to pass an argument by value.
/// The size and alignment information of the argument is encoded in
/// its parameter attribute.
-void CCState::HandleByVal(unsigned ValNo, MVT ValVT,
- MVT LocVT, CCValAssign::LocInfo LocInfo,
- int MinSize, int MinAlign,
- ISD::ArgFlagsTy ArgFlags) {
- unsigned Align = ArgFlags.getByValAlign();
+void CCState::HandleByVal(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, int MinSize,
+ int MinAlignment, ISD::ArgFlagsTy ArgFlags) {
+ Align MinAlign(MinAlignment);
+ Align Alignment(ArgFlags.getByValAlign());
unsigned Size = ArgFlags.getByValSize();
if (MinSize > (int)Size)
Size = MinSize;
- if (MinAlign > (int)Align)
- Align = MinAlign;
- ensureMaxAlignment(Align);
- MF.getSubtarget().getTargetLowering()->HandleByVal(this, Size, Align);
+ if (MinAlign > Alignment)
+ Alignment = MinAlign;
+ ensureMaxAlignment(Alignment);
+ MF.getSubtarget().getTargetLowering()->HandleByVal(this, Size,
+ Alignment.value());
Size = unsigned(alignTo(Size, MinAlign));
- unsigned Offset = AllocateStack(Size, Align);
+ unsigned Offset = AllocateStack(Size, Alignment.value());
addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
}
@@ -90,13 +90,8 @@ CCState::AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
for (unsigned i = 0; i != NumArgs; ++i) {
MVT ArgVT = Ins[i].VT;
ISD::ArgFlagsTy ArgFlags = Ins[i].Flags;
- if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) {
-#ifndef NDEBUG
- dbgs() << "Formal argument #" << i << " has unhandled type "
- << EVT(ArgVT).getEVTString() << '\n';
-#endif
- llvm_unreachable(nullptr);
- }
+ if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this))
+ report_fatal_error("unable to allocate function argument #" + Twine(i));
}
}
@@ -122,13 +117,8 @@ void CCState::AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
MVT VT = Outs[i].VT;
ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
- if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this)) {
-#ifndef NDEBUG
- dbgs() << "Return operand #" << i << " has unhandled type "
- << EVT(VT).getEVTString() << '\n';
-#endif
- llvm_unreachable(nullptr);
- }
+ if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this))
+ report_fatal_error("unable to allocate function return #" + Twine(i));
}
}
@@ -209,7 +199,7 @@ static bool isValueTypeInRegForCC(CallingConv::ID CC, MVT VT) {
void CCState::getRemainingRegParmsForType(SmallVectorImpl<MCPhysReg> &Regs,
MVT VT, CCAssignFn Fn) {
unsigned SavedStackOffset = StackOffset;
- unsigned SavedMaxStackArgAlign = MaxStackArgAlign;
+ Align SavedMaxStackArgAlign = MaxStackArgAlign;
unsigned NumLocs = Locs.size();
// Set the 'inreg' flag if it is used for this calling convention.
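HandleByVal now compares and merges alignments as Align values, which are totally ordered by byte count, so the max-of-two-alignments logic above reads directly. A small check of that assumption:

#include "llvm/Support/Alignment.h"
#include <cassert>

int main() {
  llvm::Align Attr(4);      // from the byval attribute
  const llvm::Align Min(8); // target-imposed minimum
  if (Min > Attr)
    Attr = Min;
  assert(Attr.value() == 8);
  return 0;
}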
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index c37ed57781d4..ad9525f927e8 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -28,6 +28,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeDetectDeadLanesPass(Registry);
initializeDwarfEHPreparePass(Registry);
initializeEarlyIfConverterPass(Registry);
+ initializeEarlyIfPredicatorPass(Registry);
initializeEarlyMachineLICMPass(Registry);
initializeEarlyTailDuplicatePass(Registry);
initializeExpandMemCmpPassPass(Registry);
@@ -53,6 +54,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeLocalStackSlotPassPass(Registry);
initializeLowerIntrinsicsPass(Registry);
initializeMIRCanonicalizerPass(Registry);
+ initializeMIRNamerPass(Registry);
initializeMachineBlockFrequencyInfoPass(Registry);
initializeMachineBlockPlacementPass(Registry);
initializeMachineBlockPlacementStatsPass(Registry);
@@ -63,10 +65,11 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeMachineFunctionPrinterPassPass(Registry);
initializeMachineLICMPass(Registry);
initializeMachineLoopInfoPass(Registry);
- initializeMachineModuleInfoPass(Registry);
+ initializeMachineModuleInfoWrapperPassPass(Registry);
initializeMachineOptimizationRemarkEmitterPassPass(Registry);
initializeMachineOutlinerPass(Registry);
initializeMachinePipelinerPass(Registry);
+ initializeModuloScheduleTestPass(Registry);
initializeMachinePostDominatorTreePass(Registry);
initializeMachineRegionInfoPassPass(Registry);
initializeMachineSchedulerPass(Registry);
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 52b4bbea012b..fa4432ea23ec 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -344,7 +344,7 @@ class TypePromotionTransaction;
// Get the DominatorTree, building if necessary.
DominatorTree &getDT(Function &F) {
if (!DT)
- DT = llvm::make_unique<DominatorTree>(F);
+ DT = std::make_unique<DominatorTree>(F);
return *DT;
}
@@ -424,7 +424,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
TLI = SubtargetInfo->getTargetLowering();
TRI = SubtargetInfo->getRegisterInfo();
}
- TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
BPI.reset(new BranchProbabilityInfo(F, *LI));
@@ -1524,7 +1524,7 @@ SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI,
const TargetLowering &TLI, const DataLayout &DL) {
BasicBlock *UserBB = User->getParent();
DenseMap<BasicBlock *, CastInst *> InsertedTruncs;
- TruncInst *TruncI = dyn_cast<TruncInst>(User);
+ auto *TruncI = cast<TruncInst>(User);
bool MadeChange = false;
for (Value::user_iterator TruncUI = TruncI->user_begin(),
@@ -1682,10 +1682,11 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
TheUse = InsertedShift;
}
- // If we removed all uses, nuke the shift.
+ // If we removed all uses, or there are none, nuke the shift.
if (ShiftI->use_empty()) {
salvageDebugInfo(*ShiftI);
ShiftI->eraseFromParent();
+ MadeChange = true;
}
return MadeChange;
@@ -1811,7 +1812,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
AllocaInst *AI;
if ((AI = dyn_cast<AllocaInst>(Val)) && AI->getAlignment() < PrefAlign &&
DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2)
- AI->setAlignment(PrefAlign);
+ AI->setAlignment(MaybeAlign(PrefAlign));
// Global variables can only be aligned if they are defined in this
// object (i.e. they are uniquely initialized in this object), and
// over-aligning global variables that have an explicit section is
@@ -1821,7 +1822,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
GV->getPointerAlignment(*DL) < PrefAlign &&
DL->getTypeAllocSize(GV->getValueType()) >=
MinSize + Offset2)
- GV->setAlignment(PrefAlign);
+ GV->setAlignment(MaybeAlign(PrefAlign));
}
// If this is a memcpy (or similar) then we may be able to improve the
// alignment
@@ -1867,24 +1868,10 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
});
return true;
}
- case Intrinsic::objectsize: {
- // Lower all uses of llvm.objectsize.*
- Value *RetVal =
- lowerObjectSizeCall(II, *DL, TLInfo, /*MustSucceed=*/true);
-
- resetIteratorIfInvalidatedWhileCalling(BB, [&]() {
- replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr);
- });
- return true;
- }
- case Intrinsic::is_constant: {
- // If is_constant hasn't folded away yet, lower it to false now.
- Constant *RetVal = ConstantInt::get(II->getType(), 0);
- resetIteratorIfInvalidatedWhileCalling(BB, [&]() {
- replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr);
- });
- return true;
- }
+ case Intrinsic::objectsize:
+ llvm_unreachable("llvm.objectsize.* should have been lowered already");
+ case Intrinsic::is_constant:
+ llvm_unreachable("llvm.is.constant.* should have been lowered already");
case Intrinsic::aarch64_stlxr:
case Intrinsic::aarch64_stxr: {
ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0));
@@ -2024,17 +2011,18 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT
/// Only dup the ReturnInst if the CallInst is likely to be emitted as a tail
/// call.
const Function *F = BB->getParent();
- SmallVector<CallInst*, 4> TailCalls;
+ SmallVector<BasicBlock*, 4> TailCallBBs;
if (PN) {
for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) {
// Look through bitcasts.
Value *IncomingVal = PN->getIncomingValue(I)->stripPointerCasts();
CallInst *CI = dyn_cast<CallInst>(IncomingVal);
+ BasicBlock *PredBB = PN->getIncomingBlock(I);
// Make sure the phi value is indeed produced by the tail call.
- if (CI && CI->hasOneUse() && CI->getParent() == PN->getIncomingBlock(I) &&
+ if (CI && CI->hasOneUse() && CI->getParent() == PredBB &&
TLI->mayBeEmittedAsTailCall(CI) &&
attributesPermitTailCall(F, CI, RetI, *TLI))
- TailCalls.push_back(CI);
+ TailCallBBs.push_back(PredBB);
}
} else {
SmallPtrSet<BasicBlock*, 4> VisitedBBs;
@@ -2052,24 +2040,20 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT
CallInst *CI = dyn_cast<CallInst>(&*RI);
if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI) &&
attributesPermitTailCall(F, CI, RetI, *TLI))
- TailCalls.push_back(CI);
+ TailCallBBs.push_back(*PI);
}
}
bool Changed = false;
- for (unsigned i = 0, e = TailCalls.size(); i != e; ++i) {
- CallInst *CI = TailCalls[i];
- CallSite CS(CI);
-
+ for (auto const &TailCallBB : TailCallBBs) {
// Make sure the call instruction is followed by an unconditional branch to
// the return block.
- BasicBlock *CallBB = CI->getParent();
- BranchInst *BI = dyn_cast<BranchInst>(CallBB->getTerminator());
+ BranchInst *BI = dyn_cast<BranchInst>(TailCallBB->getTerminator());
if (!BI || !BI->isUnconditional() || BI->getSuccessor(0) != BB)
continue;
- // Duplicate the return into CallBB.
- (void)FoldReturnIntoUncondBranch(RetI, BB, CallBB);
+ // Duplicate the return into TailCallBB.
+ (void)FoldReturnIntoUncondBranch(RetI, BB, TailCallBB);
ModifiedDT = Changed = true;
++NumRetsDup;
}
@@ -2683,26 +2667,26 @@ private:
void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx,
Value *NewVal) {
- Actions.push_back(llvm::make_unique<TypePromotionTransaction::OperandSetter>(
+ Actions.push_back(std::make_unique<TypePromotionTransaction::OperandSetter>(
Inst, Idx, NewVal));
}
void TypePromotionTransaction::eraseInstruction(Instruction *Inst,
Value *NewVal) {
Actions.push_back(
- llvm::make_unique<TypePromotionTransaction::InstructionRemover>(
+ std::make_unique<TypePromotionTransaction::InstructionRemover>(
Inst, RemovedInsts, NewVal));
}
void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst,
Value *New) {
Actions.push_back(
- llvm::make_unique<TypePromotionTransaction::UsesReplacer>(Inst, New));
+ std::make_unique<TypePromotionTransaction::UsesReplacer>(Inst, New));
}
void TypePromotionTransaction::mutateType(Instruction *Inst, Type *NewTy) {
Actions.push_back(
- llvm::make_unique<TypePromotionTransaction::TypeMutator>(Inst, NewTy));
+ std::make_unique<TypePromotionTransaction::TypeMutator>(Inst, NewTy));
}
Value *TypePromotionTransaction::createTrunc(Instruction *Opnd,
@@ -2732,7 +2716,7 @@ Value *TypePromotionTransaction::createZExt(Instruction *Inst,
void TypePromotionTransaction::moveBefore(Instruction *Inst,
Instruction *Before) {
Actions.push_back(
- llvm::make_unique<TypePromotionTransaction::InstructionMoveBefore>(
+ std::make_unique<TypePromotionTransaction::InstructionMoveBefore>(
Inst, Before));
}
@@ -3048,7 +3032,7 @@ public:
To = dyn_cast<PHINode>(OldReplacement);
OldReplacement = Get(From);
}
- assert(Get(To) == To && "Replacement PHI node is already replaced.");
+ assert(To && Get(To) == To && "Replacement PHI node is already replaced.");
Put(From, To);
From->replaceAllUsesWith(To);
AllPhiNodes.erase(From);
@@ -3334,7 +3318,7 @@ private:
// So the values are different and do not match. So we need them to
// match. (But we register no more than one match per PHI node, so that
// we won't later try to replace them twice.)
- if (!MatchedPHIs.insert(FirstPhi).second)
+ if (MatchedPHIs.insert(FirstPhi).second)
Matcher.insert({ FirstPhi, SecondPhi });
// But we must check it.
WorkList.push_back({ FirstPhi, SecondPhi });
@@ -3412,11 +3396,10 @@ private:
Select->setFalseValue(ST.Get(Map[FalseValue]));
} else {
// Must be a Phi node then.
- PHINode *PHI = cast<PHINode>(V);
- auto *CurrentPhi = dyn_cast<PHINode>(Current);
+ auto *PHI = cast<PHINode>(V);
// Fill the Phi node with values from predecessors.
for (auto B : predecessors(PHI->getParent())) {
- Value *PV = CurrentPhi->getIncomingValueForBlock(B);
+ Value *PV = cast<PHINode>(Current)->getIncomingValueForBlock(B);
assert(Map.find(PV) != Map.end() && "No predecessor Value!");
PHI->addIncoming(ST.Get(Map[PV]), B);
}
@@ -3785,13 +3768,11 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
// poisoned value regular value
// It should be OK since undef covers valid value.
if (Inst->getOpcode() == Instruction::Shl && Inst->hasOneUse()) {
- const Instruction *ExtInst =
- dyn_cast<const Instruction>(*Inst->user_begin());
+ const auto *ExtInst = cast<const Instruction>(*Inst->user_begin());
if (ExtInst->hasOneUse()) {
- const Instruction *AndInst =
- dyn_cast<const Instruction>(*ExtInst->user_begin());
+ const auto *AndInst = dyn_cast<const Instruction>(*ExtInst->user_begin());
if (AndInst && AndInst->getOpcode() == Instruction::And) {
- const ConstantInt *Cst = dyn_cast<ConstantInt>(AndInst->getOperand(1));
+ const auto *Cst = dyn_cast<ConstantInt>(AndInst->getOperand(1));
if (Cst &&
Cst->getValue().isIntN(Inst->getType()->getIntegerBitWidth()))
return true;
@@ -4793,8 +4774,8 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
<< " for " << *MemoryInst << "\n");
if (SunkAddr->getType() != Addr->getType())
SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
- } else if (AddrSinkUsingGEPs ||
- (!AddrSinkUsingGEPs.getNumOccurrences() && TM && TTI->useAA())) {
+ } else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() &&
+ TM && SubtargetInfo->addrSinkUsingGEPs())) {
// By default, we use the GEP-based method when AA is used later. This
// prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.
LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode
@@ -5816,7 +5797,7 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
return false;
IRBuilder<> Builder(Load->getNextNode());
- auto *NewAnd = dyn_cast<Instruction>(
+ auto *NewAnd = cast<Instruction>(
Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));
// Mark this instruction as "inserted by CGP", so that other
// optimizations don't touch it.
@@ -6193,35 +6174,49 @@ bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) {
// OpsToSink can contain multiple uses in a use chain (e.g.
// (%u1 with %u1 = shufflevector), (%u2 with %u2 = zext %u1)). The dominating
- // uses must come first, which means they are sunk first, temporarily creating
- // invalid IR. This will be fixed once their dominated users are sunk and
- // updated.
+ // uses must come first, so we process the ops in reverse order so as to not
+ // create invalid IR.
BasicBlock *TargetBB = I->getParent();
bool Changed = false;
SmallVector<Use *, 4> ToReplace;
- for (Use *U : OpsToSink) {
+ for (Use *U : reverse(OpsToSink)) {
auto *UI = cast<Instruction>(U->get());
if (UI->getParent() == TargetBB || isa<PHINode>(UI))
continue;
ToReplace.push_back(U);
}
- SmallPtrSet<Instruction *, 4> MaybeDead;
+ SetVector<Instruction *> MaybeDead;
+ DenseMap<Instruction *, Instruction *> NewInstructions;
+ Instruction *InsertPoint = I;
for (Use *U : ToReplace) {
auto *UI = cast<Instruction>(U->get());
Instruction *NI = UI->clone();
+ NewInstructions[UI] = NI;
MaybeDead.insert(UI);
LLVM_DEBUG(dbgs() << "Sinking " << *UI << " to user " << *I << "\n");
- NI->insertBefore(I);
+ NI->insertBefore(InsertPoint);
+ InsertPoint = NI;
InsertedInsts.insert(NI);
- U->set(NI);
+
+ // Update the use for the new instruction, making sure that we update the
+ // sunk instruction uses, if it is part of a chain that has already been
+ // sunk.
+ Instruction *OldI = cast<Instruction>(U->getUser());
+ if (NewInstructions.count(OldI))
+ NewInstructions[OldI]->setOperand(U->getOperandNo(), NI);
+ else
+ U->set(NI);
Changed = true;
}
// Remove instructions that are dead after sinking.
- for (auto *I : MaybeDead)
- if (!I->hasNUsesOrMore(1))
+ for (auto *I : MaybeDead) {
+ if (!I->hasNUsesOrMore(1)) {
+ LLVM_DEBUG(dbgs() << "Removing dead instruction: " << *I << "\n");
I->eraseFromParent();
+ }
+ }
return Changed;
}
@@ -7106,7 +7101,6 @@ bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) {
for (auto &I : reverse(BB)) {
if (makeBitReverse(I, *DL, *TLI)) {
MadeBitReverse = MadeChange = true;
- ModifiedDT = true;
break;
}
}
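The tryToSinkFreeOperands fix threads two pieces of state: InsertPoint keeps each clone above the previous one, and NewInstructions redirects a use to the clone of its user whenever that user was itself sunk. A toy analogue of the rewiring step (plain C++, not LLVM API):

#include <map>
#include <vector>

struct Node { std::vector<Node *> Ops; };

// If User was cloned, update the clone's operand; otherwise update User.
void rewire(std::map<Node *, Node *> &Clones, Node *User, unsigned OpNo,
            Node *NewOp) {
  auto It = Clones.find(User);
  Node *Target = (It == Clones.end()) ? User : It->second;
  Target->Ops[OpNo] = NewOp;
}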
diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp
index 4144c243a341..702e7e244bce 100644
--- a/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -187,7 +187,7 @@ void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr &MI) {
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg()) continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == 0) continue;
const TargetRegisterClass *NewRC = nullptr;
@@ -272,7 +272,7 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr &MI, unsigned Count) {
}
if (!MO.isReg()) continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == 0) continue;
if (!MO.isDef()) continue;
@@ -303,7 +303,7 @@ void CriticalAntiDepBreaker::ScanInstruction(MachineInstr &MI, unsigned Count) {
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg()) continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == 0) continue;
if (!MO.isUse()) continue;
@@ -457,6 +457,7 @@ BreakAntiDependencies(const std::vector<SUnit> &SUnits,
if (!Max || SU->getDepth() + SU->Latency > Max->getDepth() + Max->Latency)
Max = SU;
}
+ assert(Max && "Failed to find bottom of the critical path");
#ifndef NDEBUG
{
@@ -612,7 +613,7 @@ BreakAntiDependencies(const std::vector<SUnit> &SUnits,
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg()) continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == 0) continue;
if (MO.isUse() && TRI->regsOverlap(AntiDepReg, Reg)) {
AntiDepReg = 0;
diff --git a/lib/CodeGen/DFAPacketizer.cpp b/lib/CodeGen/DFAPacketizer.cpp
index b99be5d7a87c..a169c3cb16b2 100644
--- a/lib/CodeGen/DFAPacketizer.cpp
+++ b/lib/CodeGen/DFAPacketizer.cpp
@@ -23,6 +23,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
@@ -71,39 +73,13 @@ static DFAInput getDFAInsnInput(const std::vector<unsigned> &InsnClass) {
// --------------------------------------------------------------------
-DFAPacketizer::DFAPacketizer(const InstrItineraryData *I,
- const DFAStateInput (*SIT)[2],
- const unsigned *SET):
- InstrItins(I), DFAStateInputTable(SIT), DFAStateEntryTable(SET) {
- // Make sure DFA types are large enough for the number of terms & resources.
- static_assert((DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) <=
- (8 * sizeof(DFAInput)),
- "(DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) too big for DFAInput");
- static_assert(
- (DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) <= (8 * sizeof(DFAStateInput)),
- "(DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) too big for DFAStateInput");
-}
-
-// Read the DFA transition table and update CachedTable.
-//
-// Format of the transition tables:
-// DFAStateInputTable[][2] = pairs of <Input, Transition> for all valid
-// transitions
-// DFAStateEntryTable[i] = Index of the first entry in DFAStateInputTable
-// for the ith state
-//
-void DFAPacketizer::ReadTable(unsigned int state) {
- unsigned ThisState = DFAStateEntryTable[state];
- unsigned NextStateInTable = DFAStateEntryTable[state+1];
- // Early exit in case CachedTable has already contains this
- // state's transitions.
- if (CachedTable.count(UnsignPair(state, DFAStateInputTable[ThisState][0])))
- return;
-
- for (unsigned i = ThisState; i < NextStateInTable; i++)
- CachedTable[UnsignPair(state, DFAStateInputTable[i][0])] =
- DFAStateInputTable[i][1];
-}
+// Make sure DFA types are large enough for the number of terms & resources.
+static_assert((DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) <=
+ (8 * sizeof(DFAInput)),
+ "(DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) too big for DFAInput");
+static_assert(
+ (DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) <= (8 * sizeof(DFAStateInput)),
+ "(DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) too big for DFAStateInput");
// Return the DFAInput for an instruction class.
DFAInput DFAPacketizer::getInsnInput(unsigned InsnClass) {
@@ -129,9 +105,7 @@ DFAInput DFAPacketizer::getInsnInput(const std::vector<unsigned> &InsnClass) {
bool DFAPacketizer::canReserveResources(const MCInstrDesc *MID) {
unsigned InsnClass = MID->getSchedClass();
DFAInput InsnInput = getInsnInput(InsnClass);
- UnsignPair StateTrans = UnsignPair(CurrentState, InsnInput);
- ReadTable(CurrentState);
- return CachedTable.count(StateTrans) != 0;
+ return A.canAdd(InsnInput);
}
// Reserve the resources occupied by a MCInstrDesc and change the current
@@ -139,10 +113,7 @@ bool DFAPacketizer::canReserveResources(const MCInstrDesc *MID) {
void DFAPacketizer::reserveResources(const MCInstrDesc *MID) {
unsigned InsnClass = MID->getSchedClass();
DFAInput InsnInput = getInsnInput(InsnClass);
- UnsignPair StateTrans = UnsignPair(CurrentState, InsnInput);
- ReadTable(CurrentState);
- assert(CachedTable.count(StateTrans) != 0);
- CurrentState = CachedTable[StateTrans];
+ A.add(InsnInput);
}
// Check if the resources occupied by a machine instruction are available
@@ -159,19 +130,33 @@ void DFAPacketizer::reserveResources(MachineInstr &MI) {
reserveResources(&MID);
}
+unsigned DFAPacketizer::getUsedResources(unsigned InstIdx) {
+ ArrayRef<NfaPath> NfaPaths = A.getNfaPaths();
+ assert(!NfaPaths.empty() && "Invalid bundle!");
+ const NfaPath &RS = NfaPaths.front();
+
+ // RS stores the cumulative resources used up to and including the I'th
+ // instruction. The 0th instruction is the base case.
+ if (InstIdx == 0)
+ return RS[0];
+ // Return the difference between the cumulative resources used by InstIdx and
+ // its predecessor.
+ return RS[InstIdx] ^ RS[InstIdx - 1];
+}
+
namespace llvm {
// This class extends ScheduleDAGInstrs and overrides the schedule method
// to build the dependence graph.
class DefaultVLIWScheduler : public ScheduleDAGInstrs {
private:
- AliasAnalysis *AA;
+ AAResults *AA;
/// Ordered list of DAG postprocessing steps.
std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
public:
DefaultVLIWScheduler(MachineFunction &MF, MachineLoopInfo &MLI,
- AliasAnalysis *AA);
+ AAResults *AA);
// Actual scheduling work.
void schedule() override;
@@ -189,7 +174,7 @@ protected:
DefaultVLIWScheduler::DefaultVLIWScheduler(MachineFunction &MF,
MachineLoopInfo &MLI,
- AliasAnalysis *AA)
+ AAResults *AA)
: ScheduleDAGInstrs(MF, &MLI), AA(AA) {
CanHandleTerminators = true;
}
@@ -207,9 +192,10 @@ void DefaultVLIWScheduler::schedule() {
}
VLIWPacketizerList::VLIWPacketizerList(MachineFunction &mf,
- MachineLoopInfo &mli, AliasAnalysis *aa)
+ MachineLoopInfo &mli, AAResults *aa)
: MF(mf), TII(mf.getSubtarget().getInstrInfo()), AA(aa) {
ResourceTracker = TII->CreateTargetScheduleState(MF.getSubtarget());
+ ResourceTracker->setTrackResources(true);
VLIWScheduler = new DefaultVLIWScheduler(MF, mli, AA);
}
@@ -224,8 +210,11 @@ void VLIWPacketizerList::endPacket(MachineBasicBlock *MBB,
LLVM_DEBUG({
if (!CurrentPacketMIs.empty()) {
dbgs() << "Finalizing packet:\n";
- for (MachineInstr *MI : CurrentPacketMIs)
- dbgs() << " * " << *MI;
+ unsigned Idx = 0;
+ for (MachineInstr *MI : CurrentPacketMIs) {
+ unsigned R = ResourceTracker->getUsedResources(Idx++);
+ dbgs() << " * [res:0x" << utohexstr(R) << "] " << *MI;
+ }
}
});
if (CurrentPacketMIs.size() > 1) {
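getUsedResources() relies on the masks being cumulative along the packet: assuming resource bits are only ever added, XOR of adjacent entries recovers the bits first used by each instruction. Worked example:

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t RS[] = {0x1, 0x3, 0x7}; // cumulative usage per instruction
  assert(RS[0] == 0x1);                  // instruction 0 used bit 0
  assert((RS[1] ^ RS[0]) == 0x2);        // instruction 1 added bit 1
  assert((RS[2] ^ RS[1]) == 0x4);        // instruction 2 added bit 2
  return 0;
}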
diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp
index 049ce7063307..9a537c859a67 100644
--- a/lib/CodeGen/DeadMachineInstructionElim.cpp
+++ b/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -75,8 +75,8 @@ bool DeadMachineInstructionElim::isDead(const MachineInstr *MI) const {
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI->getOperand(i);
if (MO.isReg() && MO.isDef()) {
- unsigned Reg = MO.getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ Register Reg = MO.getReg();
+ if (Register::isPhysicalRegister(Reg)) {
// Don't delete live physreg defs, or any reserved register defs.
if (LivePhysRegs.test(Reg) || MRI->isReserved(Reg))
return false;
@@ -140,8 +140,8 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) {
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI->getOperand(i);
if (MO.isReg() && MO.isDef()) {
- unsigned Reg = MO.getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ Register Reg = MO.getReg();
+ if (Register::isPhysicalRegister(Reg)) {
// Check the subreg set, not the alias set, because a def
// of a super-register may still be partially live after
// this def.
@@ -159,8 +159,8 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) {
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI->getOperand(i);
if (MO.isReg() && MO.isUse()) {
- unsigned Reg = MO.getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ Register Reg = MO.getReg();
+ if (Register::isPhysicalRegister(Reg)) {
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
LivePhysRegs.set(*AI);
}
diff --git a/lib/CodeGen/DetectDeadLanes.cpp b/lib/CodeGen/DetectDeadLanes.cpp
index fe78acf4d80a..6d5306c1dc0c 100644
--- a/lib/CodeGen/DetectDeadLanes.cpp
+++ b/lib/CodeGen/DetectDeadLanes.cpp
@@ -154,7 +154,7 @@ static bool isCrossCopy(const MachineRegisterInfo &MRI,
const TargetRegisterClass *DstRC,
const MachineOperand &MO) {
assert(lowersToCopies(MI));
- unsigned SrcReg = MO.getReg();
+ Register SrcReg = MO.getReg();
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
if (DstRC == SrcRC)
return false;
@@ -194,8 +194,8 @@ void DetectDeadLanes::addUsedLanesOnOperand(const MachineOperand &MO,
LaneBitmask UsedLanes) {
if (!MO.readsReg())
return;
- unsigned MOReg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(MOReg))
+ Register MOReg = MO.getReg();
+ if (!Register::isVirtualRegister(MOReg))
return;
unsigned MOSubReg = MO.getSubReg();
@@ -203,7 +203,7 @@ void DetectDeadLanes::addUsedLanesOnOperand(const MachineOperand &MO,
UsedLanes = TRI->composeSubRegIndexLaneMask(MOSubReg, UsedLanes);
UsedLanes &= MRI->getMaxLaneMaskForVReg(MOReg);
- unsigned MORegIdx = TargetRegisterInfo::virtReg2Index(MOReg);
+ unsigned MORegIdx = Register::virtReg2Index(MOReg);
VRegInfo &MORegInfo = VRegInfos[MORegIdx];
LaneBitmask PrevUsedLanes = MORegInfo.UsedLanes;
// Any change at all?
@@ -219,7 +219,7 @@ void DetectDeadLanes::addUsedLanesOnOperand(const MachineOperand &MO,
void DetectDeadLanes::transferUsedLanesStep(const MachineInstr &MI,
LaneBitmask UsedLanes) {
for (const MachineOperand &MO : MI.uses()) {
- if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg()))
continue;
LaneBitmask UsedOnMO = transferUsedLanes(MI, UsedLanes, MO);
addUsedLanesOnOperand(MO, UsedOnMO);
@@ -230,8 +230,8 @@ LaneBitmask DetectDeadLanes::transferUsedLanes(const MachineInstr &MI,
LaneBitmask UsedLanes,
const MachineOperand &MO) const {
unsigned OpNum = MI.getOperandNo(&MO);
- assert(lowersToCopies(MI) && DefinedByCopy[
- TargetRegisterInfo::virtReg2Index(MI.getOperand(0).getReg())]);
+ assert(lowersToCopies(MI) &&
+ DefinedByCopy[Register::virtReg2Index(MI.getOperand(0).getReg())]);
switch (MI.getOpcode()) {
case TargetOpcode::COPY:
@@ -250,7 +250,7 @@ LaneBitmask DetectDeadLanes::transferUsedLanes(const MachineInstr &MI,
return MO2UsedLanes;
const MachineOperand &Def = MI.getOperand(0);
- unsigned DefReg = Def.getReg();
+ Register DefReg = Def.getReg();
const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
LaneBitmask MO1UsedLanes;
if (RC->CoveredBySubRegs)
@@ -285,10 +285,10 @@ void DetectDeadLanes::transferDefinedLanesStep(const MachineOperand &Use,
if (MI.getOpcode() == TargetOpcode::PATCHPOINT)
return;
const MachineOperand &Def = *MI.defs().begin();
- unsigned DefReg = Def.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(DefReg))
+ Register DefReg = Def.getReg();
+ if (!Register::isVirtualRegister(DefReg))
return;
- unsigned DefRegIdx = TargetRegisterInfo::virtReg2Index(DefReg);
+ unsigned DefRegIdx = Register::virtReg2Index(DefReg);
if (!DefinedByCopy.test(DefRegIdx))
return;
@@ -360,7 +360,7 @@ LaneBitmask DetectDeadLanes::determineInitialDefinedLanes(unsigned Reg) {
if (lowersToCopies(DefMI)) {
// Start optimistically with no used or defined lanes for copy
// instructions. The following dataflow analysis will add more bits.
- unsigned RegIdx = TargetRegisterInfo::virtReg2Index(Reg);
+ unsigned RegIdx = Register::virtReg2Index(Reg);
DefinedByCopy.set(RegIdx);
PutInWorklist(RegIdx);
@@ -377,17 +377,17 @@ LaneBitmask DetectDeadLanes::determineInitialDefinedLanes(unsigned Reg) {
for (const MachineOperand &MO : DefMI.uses()) {
if (!MO.isReg() || !MO.readsReg())
continue;
- unsigned MOReg = MO.getReg();
+ Register MOReg = MO.getReg();
if (!MOReg)
continue;
LaneBitmask MODefinedLanes;
- if (TargetRegisterInfo::isPhysicalRegister(MOReg)) {
+ if (Register::isPhysicalRegister(MOReg)) {
MODefinedLanes = LaneBitmask::getAll();
} else if (isCrossCopy(*MRI, DefMI, DefRC, MO)) {
MODefinedLanes = LaneBitmask::getAll();
} else {
- assert(TargetRegisterInfo::isVirtualRegister(MOReg));
+ assert(Register::isVirtualRegister(MOReg));
if (MRI->hasOneDef(MOReg)) {
const MachineOperand &MODef = *MRI->def_begin(MOReg);
const MachineInstr &MODefMI = *MODef.getParent();
@@ -428,10 +428,10 @@ LaneBitmask DetectDeadLanes::determineInitialUsedLanes(unsigned Reg) {
if (lowersToCopies(UseMI)) {
assert(UseMI.getDesc().getNumDefs() == 1);
const MachineOperand &Def = *UseMI.defs().begin();
- unsigned DefReg = Def.getReg();
+ Register DefReg = Def.getReg();
// The used lanes of COPY-like instruction operands are determined by the
// following dataflow analysis.
- if (TargetRegisterInfo::isVirtualRegister(DefReg)) {
+ if (Register::isVirtualRegister(DefReg)) {
// But ignore copies across incompatible register classes.
bool CrossCopy = false;
if (lowersToCopies(UseMI)) {
@@ -470,10 +470,10 @@ bool DetectDeadLanes::isUndefInput(const MachineOperand &MO,
if (!lowersToCopies(MI))
return false;
const MachineOperand &Def = MI.getOperand(0);
- unsigned DefReg = Def.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(DefReg))
+ Register DefReg = Def.getReg();
+ if (!Register::isVirtualRegister(DefReg))
return false;
- unsigned DefRegIdx = TargetRegisterInfo::virtReg2Index(DefReg);
+ unsigned DefRegIdx = Register::virtReg2Index(DefReg);
if (!DefinedByCopy.test(DefRegIdx))
return false;
@@ -482,8 +482,8 @@ bool DetectDeadLanes::isUndefInput(const MachineOperand &MO,
if (UsedLanes.any())
return false;
- unsigned MOReg = MO.getReg();
- if (TargetRegisterInfo::isVirtualRegister(MOReg)) {
+ Register MOReg = MO.getReg();
+ if (Register::isVirtualRegister(MOReg)) {
const TargetRegisterClass *DstRC = MRI->getRegClass(DefReg);
*CrossCopy = isCrossCopy(*MRI, MI, DstRC, MO);
}
@@ -494,7 +494,7 @@ bool DetectDeadLanes::runOnce(MachineFunction &MF) {
// First pass: Populate defs/uses of vregs with initial values
unsigned NumVirtRegs = MRI->getNumVirtRegs();
for (unsigned RegIdx = 0; RegIdx < NumVirtRegs; ++RegIdx) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(RegIdx);
+ unsigned Reg = Register::index2VirtReg(RegIdx);
// Determine used/defined lanes and add copy instructions to worklist.
VRegInfo &Info = VRegInfos[RegIdx];
@@ -508,7 +508,7 @@ bool DetectDeadLanes::runOnce(MachineFunction &MF) {
Worklist.pop_front();
WorklistMembers.reset(RegIdx);
VRegInfo &Info = VRegInfos[RegIdx];
- unsigned Reg = TargetRegisterInfo::index2VirtReg(RegIdx);
+ unsigned Reg = Register::index2VirtReg(RegIdx);
// Transfer UsedLanes to operands of DefMI (backwards dataflow).
MachineOperand &Def = *MRI->def_begin(Reg);
@@ -522,7 +522,7 @@ bool DetectDeadLanes::runOnce(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "Defined/Used lanes:\n"; for (unsigned RegIdx = 0;
RegIdx < NumVirtRegs;
++RegIdx) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(RegIdx);
+ unsigned Reg = Register::index2VirtReg(RegIdx);
const VRegInfo &Info = VRegInfos[RegIdx];
dbgs() << printReg(Reg, nullptr)
<< " Used: " << PrintLaneMask(Info.UsedLanes)
@@ -536,10 +536,10 @@ bool DetectDeadLanes::runOnce(MachineFunction &MF) {
for (MachineOperand &MO : MI.operands()) {
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ Register Reg = MO.getReg();
+ if (!Register::isVirtualRegister(Reg))
continue;
- unsigned RegIdx = TargetRegisterInfo::virtReg2Index(Reg);
+ unsigned RegIdx = Register::virtReg2Index(Reg);
const VRegInfo &RegInfo = VRegInfos[RegIdx];
if (MO.isDef() && !MO.isDead() && RegInfo.UsedLanes.none()) {
LLVM_DEBUG(dbgs()
diff --git a/lib/CodeGen/EarlyIfConversion.cpp b/lib/CodeGen/EarlyIfConversion.cpp
index 0a83760befaa..e5694218b5c3 100644
--- a/lib/CodeGen/EarlyIfConversion.cpp
+++ b/lib/CodeGen/EarlyIfConversion.cpp
@@ -25,6 +25,7 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineTraceMetrics.h"
@@ -140,6 +141,18 @@ private:
/// speculated.
bool canSpeculateInstrs(MachineBasicBlock *MBB);
+ /// Return true if all non-terminator instructions in MBB can be safely
+ /// predicated.
+ bool canPredicateInstrs(MachineBasicBlock *MBB);
+
+ /// Scan through instruction dependencies and update InsertAfter array.
+ /// Return false if any dependency is incompatible with if conversion.
+ bool InstrDependenciesAllowIfConv(MachineInstr *I);
+
+ /// Predicate all instructions of the basic block with current condition
+ /// except for terminators. Reverse the condition if ReversePredicate is set.
+ void PredicateBlock(MachineBasicBlock *MBB, bool ReversePredicate);
+
/// Find a valid insertion point in Head.
bool findInsertionPoint();
@@ -163,11 +176,14 @@ public:
/// canConvertIf - If the sub-CFG headed by MBB can be if-converted,
/// initialize the internal state, and return true.
- bool canConvertIf(MachineBasicBlock *MBB);
+ /// If Predicate is set, try to predicate the block; otherwise try to
+ /// speculatively execute it.
+ bool canConvertIf(MachineBasicBlock *MBB, bool Predicate = false);
/// convertIf - If-convert the last block passed to canConvertIf(), assuming
/// it is possible. Add any erased blocks to RemovedBlocks.
- void convertIf(SmallVectorImpl<MachineBasicBlock*> &RemovedBlocks);
+ void convertIf(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks,
+ bool Predicate = false);
};
} // end anonymous namespace
@@ -225,37 +241,112 @@ bool SSAIfConv::canSpeculateInstrs(MachineBasicBlock *MBB) {
}
// Check for any dependencies on Head instructions.
- for (const MachineOperand &MO : I->operands()) {
- if (MO.isRegMask()) {
- LLVM_DEBUG(dbgs() << "Won't speculate regmask: " << *I);
- return false;
- }
- if (!MO.isReg())
- continue;
- unsigned Reg = MO.getReg();
+ if (!InstrDependenciesAllowIfConv(&(*I)))
+ return false;
+ }
+ return true;
+}
- // Remember clobbered regunits.
- if (MO.isDef() && TargetRegisterInfo::isPhysicalRegister(Reg))
- for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
- ClobberedRegUnits.set(*Units);
+/// Check that there are no dependencies preventing if-conversion.
+///
+/// If instruction uses any values that are defined in the head basic block,
+/// the defining instructions are added to InsertAfter.
+bool SSAIfConv::InstrDependenciesAllowIfConv(MachineInstr *I) {
+ for (const MachineOperand &MO : I->operands()) {
+ if (MO.isRegMask()) {
+ LLVM_DEBUG(dbgs() << "Won't speculate regmask: " << *I);
+ return false;
+ }
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
- if (!MO.readsReg() || !TargetRegisterInfo::isVirtualRegister(Reg))
- continue;
- MachineInstr *DefMI = MRI->getVRegDef(Reg);
- if (!DefMI || DefMI->getParent() != Head)
- continue;
- if (InsertAfter.insert(DefMI).second)
- LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " depends on "
- << *DefMI);
- if (DefMI->isTerminator()) {
- LLVM_DEBUG(dbgs() << "Can't insert instructions below terminator.\n");
- return false;
- }
+ // Remember clobbered regunits.
+ if (MO.isDef() && Register::isPhysicalRegister(Reg))
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+ ClobberedRegUnits.set(*Units);
+
+ if (!MO.readsReg() || !Register::isVirtualRegister(Reg))
+ continue;
+ MachineInstr *DefMI = MRI->getVRegDef(Reg);
+ if (!DefMI || DefMI->getParent() != Head)
+ continue;
+ if (InsertAfter.insert(DefMI).second)
+ LLVM_DEBUG(dbgs() << printMBBReference(*I->getParent()) << " depends on "
+ << *DefMI);
+ if (DefMI->isTerminator()) {
+ LLVM_DEBUG(dbgs() << "Can't insert instructions below terminator.\n");
+ return false;
}
}
return true;
}
+/// canPredicateInstrs - Returns true if all the instructions in MBB can safely
+/// be predicated. The terminators are not considered.
+///
+/// If instructions use any values that are defined in the head basic block,
+/// the defining instructions are added to InsertAfter.
+///
+/// Any clobbered regunits are added to ClobberedRegUnits.
+///
+bool SSAIfConv::canPredicateInstrs(MachineBasicBlock *MBB) {
+ // Reject any live-in physregs. It's probably CPSR/EFLAGS, and very hard to
+ // get right.
+ if (!MBB->livein_empty()) {
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " has live-ins.\n");
+ return false;
+ }
+
+ unsigned InstrCount = 0;
+
+ // Check all instructions, except the terminators. It is assumed that
+ // terminators never have side effects or define any used register values.
+ for (MachineBasicBlock::iterator I = MBB->begin(),
+ E = MBB->getFirstTerminator();
+ I != E; ++I) {
+ if (I->isDebugInstr())
+ continue;
+
+ if (++InstrCount > BlockInstrLimit && !Stress) {
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << " has more than "
+ << BlockInstrLimit << " instructions.\n");
+ return false;
+ }
+
+ // There shouldn't normally be any phis in a single-predecessor block.
+ if (I->isPHI()) {
+ LLVM_DEBUG(dbgs() << "Can't predicate: " << *I);
+ return false;
+ }
+
+ // Check that instruction is predicable and that it is not already
+ // predicated.
+ if (!TII->isPredicable(*I) || TII->isPredicated(*I)) {
+ return false;
+ }
+
+ // Check for any dependencies on Head instructions.
+ if (!InstrDependenciesAllowIfConv(&(*I)))
+ return false;
+ }
+ return true;
+}
+
+// Apply predicate to all instructions in the machine block.
+void SSAIfConv::PredicateBlock(MachineBasicBlock *MBB, bool ReversePredicate) {
+ auto Condition = Cond;
+ if (ReversePredicate)
+ TII->reverseBranchCondition(Condition);
+ // Terminators don't need to be predicated as they will be removed.
+ for (MachineBasicBlock::iterator I = MBB->begin(),
+ E = MBB->getFirstTerminator();
+ I != E; ++I) {
+ if (I->isDebugInstr())
+ continue;
+ TII->PredicateInstruction(*I, Condition);
+ }
+}
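A minimal sketch of the intended transform on a triangle, in hypothetical MIR
with an illustrative predicated form (the real encoding comes from the target's
PredicateInstruction hook):

    ; before: Head branches on %c; TBB holds   %x = ADD %a, %b
    ; after canConvertIf(MBB, /*Predicate=*/true) and convertIf:
    ;   Head: %x = ADD %a, %b, pred(%c)        ; predicated, spliced into Head
    ; TBB's terminators are never predicated; convertIf removes them outright.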
/// Find an insertion point in Head for the speculated instructions. The
/// insertion point must be:
@@ -288,8 +379,8 @@ bool SSAIfConv::findInsertionPoint() {
// We're ignoring regmask operands. That is conservatively correct.
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ Register Reg = MO.getReg();
+ if (!Register::isPhysicalRegister(Reg))
continue;
// I clobbers Reg, so it isn't live before I.
if (MO.isDef())
@@ -337,7 +428,7 @@ bool SSAIfConv::findInsertionPoint() {
/// canConvertIf - analyze the sub-cfg rooted in MBB, and return true if it is
/// a potential candidate for if-conversion. Fill out the internal state.
///
-bool SSAIfConv::canConvertIf(MachineBasicBlock *MBB) {
+bool SSAIfConv::canConvertIf(MachineBasicBlock *MBB, bool Predicate) {
Head = MBB;
TBB = FBB = Tail = nullptr;
@@ -378,8 +469,9 @@ bool SSAIfConv::canConvertIf(MachineBasicBlock *MBB) {
}
// This is a triangle or a diamond.
- // If Tail doesn't have any phis, there must be side effects.
- if (Tail->empty() || !Tail->front().isPHI()) {
+ // If we cannot predicate and Tail has no phis, skip: there must be side
+ // effects that can only be handled with predication.
+ if (!Predicate && (Tail->empty() || !Tail->front().isPHI())) {
LLVM_DEBUG(dbgs() << "No phis in tail.\n");
return false;
}
@@ -423,8 +515,8 @@ bool SSAIfConv::canConvertIf(MachineBasicBlock *MBB) {
if (PI.PHI->getOperand(i+1).getMBB() == FPred)
PI.FReg = PI.PHI->getOperand(i).getReg();
}
- assert(TargetRegisterInfo::isVirtualRegister(PI.TReg) && "Bad PHI");
- assert(TargetRegisterInfo::isVirtualRegister(PI.FReg) && "Bad PHI");
+ assert(Register::isVirtualRegister(PI.TReg) && "Bad PHI");
+ assert(Register::isVirtualRegister(PI.FReg) && "Bad PHI");
// Get target information.
if (!TII->canInsertSelect(*Head, Cond, PI.TReg, PI.FReg,
@@ -437,10 +529,17 @@ bool SSAIfConv::canConvertIf(MachineBasicBlock *MBB) {
// Check that the conditional instructions can be speculated.
InsertAfter.clear();
ClobberedRegUnits.reset();
- if (TBB != Tail && !canSpeculateInstrs(TBB))
- return false;
- if (FBB != Tail && !canSpeculateInstrs(FBB))
- return false;
+ if (Predicate) {
+ if (TBB != Tail && !canPredicateInstrs(TBB))
+ return false;
+ if (FBB != Tail && !canPredicateInstrs(FBB))
+ return false;
+ } else {
+ if (TBB != Tail && !canSpeculateInstrs(TBB))
+ return false;
+ if (FBB != Tail && !canSpeculateInstrs(FBB))
+ return false;
+ }
// Try to find a valid insertion point for the speculated instructions in the
// head basic block.
@@ -467,7 +566,7 @@ void SSAIfConv::replacePHIInstrs() {
for (unsigned i = 0, e = PHIs.size(); i != e; ++i) {
PHIInfo &PI = PHIs[i];
LLVM_DEBUG(dbgs() << "If-converting " << *PI.PHI);
- unsigned DstReg = PI.PHI->getOperand(0).getReg();
+ Register DstReg = PI.PHI->getOperand(0).getReg();
TII->insertSelect(*Head, FirstTerm, HeadDL, DstReg, Cond, PI.TReg, PI.FReg);
LLVM_DEBUG(dbgs() << " --> " << *std::prev(FirstTerm));
PI.PHI->eraseFromParent();
@@ -494,7 +593,7 @@ void SSAIfConv::rewritePHIOperands() {
// equal.
DstReg = PI.TReg;
} else {
- unsigned PHIDst = PI.PHI->getOperand(0).getReg();
+ Register PHIDst = PI.PHI->getOperand(0).getReg();
DstReg = MRI->createVirtualRegister(MRI->getRegClass(PHIDst));
TII->insertSelect(*Head, FirstTerm, HeadDL,
DstReg, Cond, PI.TReg, PI.FReg);
@@ -521,7 +620,8 @@ void SSAIfConv::rewritePHIOperands() {
///
/// Any basic blocks erased will be added to RemovedBlocks.
///
-void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock*> &RemovedBlocks) {
+void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks,
+ bool Predicate) {
assert(Head && Tail && TBB && FBB && "Call canConvertIf first.");
// Update statistics.
@@ -531,11 +631,16 @@ void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock*> &RemovedBlocks) {
++NumDiamondsConv;
// Move all instructions into Head, except for the terminators.
- if (TBB != Tail)
+ if (TBB != Tail) {
+ if (Predicate)
+ PredicateBlock(TBB, /*ReversePredicate=*/false);
Head->splice(InsertionPoint, TBB, TBB->begin(), TBB->getFirstTerminator());
- if (FBB != Tail)
+ }
+ if (FBB != Tail) {
+ if (Predicate)
+ PredicateBlock(FBB, /*ReversePredicate=*/true);
Head->splice(InsertionPoint, FBB, FBB->begin(), FBB->getFirstTerminator());
-
+ }
// Are there extra Tail predecessors?
bool ExtraPreds = Tail->pred_size() != 2;
if (ExtraPreds)
@@ -587,7 +692,6 @@ void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock*> &RemovedBlocks) {
LLVM_DEBUG(dbgs() << *Head);
}
-
//===----------------------------------------------------------------------===//
// EarlyIfConverter Pass
//===----------------------------------------------------------------------===//
@@ -613,8 +717,6 @@ public:
private:
bool tryConvertIf(MachineBasicBlock*);
- void updateDomTree(ArrayRef<MachineBasicBlock*> Removed);
- void updateLoops(ArrayRef<MachineBasicBlock*> Removed);
void invalidateTraces();
bool shouldConvertIf();
};
@@ -642,32 +744,36 @@ void EarlyIfConverter::getAnalysisUsage(AnalysisUsage &AU) const {
MachineFunctionPass::getAnalysisUsage(AU);
}
+namespace {
/// Update the dominator tree after if-conversion erased some blocks.
-void EarlyIfConverter::updateDomTree(ArrayRef<MachineBasicBlock*> Removed) {
+void updateDomTree(MachineDominatorTree *DomTree, const SSAIfConv &IfConv,
+ ArrayRef<MachineBasicBlock *> Removed) {
// convertIf can remove TBB and FBB, and Tail can be merged into Head.
// TBB and FBB should not dominate any blocks.
// Tail children should be transferred to Head.
MachineDomTreeNode *HeadNode = DomTree->getNode(IfConv.Head);
- for (unsigned i = 0, e = Removed.size(); i != e; ++i) {
- MachineDomTreeNode *Node = DomTree->getNode(Removed[i]);
+ for (auto B : Removed) {
+ MachineDomTreeNode *Node = DomTree->getNode(B);
assert(Node != HeadNode && "Cannot erase the head node");
while (Node->getNumChildren()) {
assert(Node->getBlock() == IfConv.Tail && "Unexpected children");
DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode);
}
- DomTree->eraseNode(Removed[i]);
+ DomTree->eraseNode(B);
}
}
/// Update LoopInfo after if-conversion.
-void EarlyIfConverter::updateLoops(ArrayRef<MachineBasicBlock*> Removed) {
+void updateLoops(MachineLoopInfo *Loops,
+ ArrayRef<MachineBasicBlock *> Removed) {
if (!Loops)
return;
// If-conversion doesn't change loop structure, and it doesn't mess with back
// edges, so updating LoopInfo is simply removing the dead blocks.
- for (unsigned i = 0, e = Removed.size(); i != e; ++i)
- Loops->removeBlock(Removed[i]);
+ for (auto B : Removed)
+ Loops->removeBlock(B);
}
+} // namespace
/// Invalidate MachineTraceMetrics before if-conversion.
void EarlyIfConverter::invalidateTraces() {
@@ -783,8 +889,8 @@ bool EarlyIfConverter::tryConvertIf(MachineBasicBlock *MBB) {
SmallVector<MachineBasicBlock*, 4> RemovedBlocks;
IfConv.convertIf(RemovedBlocks);
Changed = true;
- updateDomTree(RemovedBlocks);
- updateLoops(RemovedBlocks);
+ updateDomTree(DomTree, IfConv, RemovedBlocks);
+ updateLoops(Loops, RemovedBlocks);
}
return Changed;
}
@@ -822,3 +928,132 @@ bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
return Changed;
}
+
+//===----------------------------------------------------------------------===//
+// EarlyIfPredicator Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+class EarlyIfPredicator : public MachineFunctionPass {
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ TargetSchedModel SchedModel;
+ MachineRegisterInfo *MRI;
+ MachineDominatorTree *DomTree;
+ MachineLoopInfo *Loops;
+ SSAIfConv IfConv;
+
+public:
+ static char ID;
+ EarlyIfPredicator() : MachineFunctionPass(ID) {}
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ StringRef getPassName() const override { return "Early If-predicator"; }
+
+protected:
+ bool tryConvertIf(MachineBasicBlock *);
+ bool shouldConvertIf();
+};
+} // end anonymous namespace
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "early-if-predicator"
+
+char EarlyIfPredicator::ID = 0;
+char &llvm::EarlyIfPredicatorID = EarlyIfPredicator::ID;
+
+INITIALIZE_PASS_BEGIN(EarlyIfPredicator, DEBUG_TYPE, "Early If Predicator",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(EarlyIfPredicator, DEBUG_TYPE, "Early If Predicator", false,
+ false)
+
+void EarlyIfPredicator::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+/// Apply the target heuristic to decide if the transformation is profitable.
+bool EarlyIfPredicator::shouldConvertIf() {
+ if (IfConv.isTriangle()) {
+ MachineBasicBlock &IfBlock =
+ (IfConv.TBB == IfConv.Tail) ? *IfConv.FBB : *IfConv.TBB;
+
+ unsigned ExtraPredCost = 0;
+ unsigned Cycles = 0;
+ for (MachineInstr &I : IfBlock) {
+ unsigned NumCycles = SchedModel.computeInstrLatency(&I, false);
+ if (NumCycles > 1)
+ Cycles += NumCycles - 1;
+ ExtraPredCost += TII->getPredicationCost(I);
+ }
+
+ return TII->isProfitableToIfCvt(IfBlock, Cycles, ExtraPredCost,
+ BranchProbability::getUnknown());
+ }
+ unsigned TExtra = 0;
+ unsigned FExtra = 0;
+ unsigned TCycle = 0;
+ unsigned FCycle = 0;
+ for (MachineInstr &I : *IfConv.TBB) {
+ unsigned NumCycles = SchedModel.computeInstrLatency(&I, false);
+ if (NumCycles > 1)
+ TCycle += NumCycles - 1;
+ TExtra += TII->getPredicationCost(I);
+ }
+ for (MachineInstr &I : *IfConv.FBB) {
+ unsigned NumCycles = SchedModel.computeInstrLatency(&I, false);
+ if (NumCycles > 1)
+ FCycle += NumCycles - 1;
+ FExtra += TII->getPredicationCost(I);
+ }
+ return TII->isProfitableToIfCvt(*IfConv.TBB, TCycle, TExtra, *IfConv.FBB,
+ FCycle, FExtra,
+ BranchProbability::getUnknown());
+}
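A worked example with illustrative numbers, not taken from this patch: if
IfBlock holds two instructions with latencies 3 and 1 and a per-instruction
predication cost of 1, the loop computes

    Cycles        = (3 - 1) = 2    ; only latencies above 1 contribute
    ExtraPredCost = 1 + 1   = 2

and isProfitableToIfCvt then weighs those totals against the branch cost under
an unknown branch probability.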
+
+/// Attempt repeated if-conversion on MBB, return true if successful.
+///
+bool EarlyIfPredicator::tryConvertIf(MachineBasicBlock *MBB) {
+ bool Changed = false;
+ while (IfConv.canConvertIf(MBB, /*Predicate*/ true) && shouldConvertIf()) {
+ // If-convert MBB and update analyses.
+ SmallVector<MachineBasicBlock *, 4> RemovedBlocks;
+ IfConv.convertIf(RemovedBlocks, /*Predicate*/ true);
+ Changed = true;
+ updateDomTree(DomTree, IfConv, RemovedBlocks);
+ updateLoops(Loops, RemovedBlocks);
+ }
+ return Changed;
+}
+
+bool EarlyIfPredicator::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "********** EARLY IF-PREDICATOR **********\n"
+ << "********** Function: " << MF.getName() << '\n');
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ TII = STI.getInstrInfo();
+ TRI = STI.getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ SchedModel.init(&STI);
+ DomTree = &getAnalysis<MachineDominatorTree>();
+ Loops = getAnalysisIfAvailable<MachineLoopInfo>();
+
+ bool Changed = false;
+ IfConv.runOnMachineFunction(MF);
+
+ // Visit blocks in dominator tree post-order. The post-order enables nested
+ // if-conversion in a single pass. The tryConvertIf() function may erase
+ // blocks, but only blocks dominated by the head block. This makes it safe to
+ // update the dominator tree while the post-order iterator is still active.
+ for (auto DomNode : post_order(DomTree))
+ if (tryConvertIf(DomNode->getBlock()))
+ Changed = true;
+
+ return Changed;
+}
diff --git a/lib/CodeGen/ExecutionDomainFix.cpp b/lib/CodeGen/ExecutionDomainFix.cpp
index a2dd5eee33b7..2cca05ea6f55 100644
--- a/lib/CodeGen/ExecutionDomainFix.cpp
+++ b/lib/CodeGen/ExecutionDomainFix.cpp
@@ -9,6 +9,7 @@
#include "llvm/CodeGen/ExecutionDomainFix.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Support/Debug.h"
using namespace llvm;
diff --git a/lib/CodeGen/ExpandMemCmp.cpp b/lib/CodeGen/ExpandMemCmp.cpp
index b425482e6adf..9916f2de0414 100644
--- a/lib/CodeGen/ExpandMemCmp.cpp
+++ b/lib/CodeGen/ExpandMemCmp.cpp
@@ -795,7 +795,7 @@ public:
TPC->getTM<TargetMachine>().getSubtargetImpl(F)->getTargetLowering();
const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
const TargetTransformInfo *TTI =
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto PA = runImpl(F, TLI, TTI, TL);
diff --git a/lib/CodeGen/ExpandPostRAPseudos.cpp b/lib/CodeGen/ExpandPostRAPseudos.cpp
index 0ab70aff7dc4..1fc57fac1489 100644
--- a/lib/CodeGen/ExpandPostRAPseudos.cpp
+++ b/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -79,17 +79,17 @@ bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) {
(MI->getOperand(2).isReg() && MI->getOperand(2).isUse()) &&
MI->getOperand(3).isImm() && "Invalid subreg_to_reg");
- unsigned DstReg = MI->getOperand(0).getReg();
- unsigned InsReg = MI->getOperand(2).getReg();
+ Register DstReg = MI->getOperand(0).getReg();
+ Register InsReg = MI->getOperand(2).getReg();
assert(!MI->getOperand(2).getSubReg() && "SubIdx on physreg?");
unsigned SubIdx = MI->getOperand(3).getImm();
assert(SubIdx != 0 && "Invalid index for insert_subreg");
- unsigned DstSubReg = TRI->getSubReg(DstReg, SubIdx);
+ Register DstSubReg = TRI->getSubReg(DstReg, SubIdx);
- assert(TargetRegisterInfo::isPhysicalRegister(DstReg) &&
+ assert(Register::isPhysicalRegister(DstReg) &&
"Insert destination must be in a physical register");
- assert(TargetRegisterInfo::isPhysicalRegister(InsReg) &&
+ assert(Register::isPhysicalRegister(InsReg) &&
"Inserted value must be in a physical register");
LLVM_DEBUG(dbgs() << "subreg: CONVERTING: " << *MI);
diff --git a/lib/CodeGen/GCMetadata.cpp b/lib/CodeGen/GCMetadata.cpp
index 9c53550eaa9d..c1d22ef89195 100644
--- a/lib/CodeGen/GCMetadata.cpp
+++ b/lib/CodeGen/GCMetadata.cpp
@@ -72,7 +72,7 @@ GCFunctionInfo &GCModuleInfo::getFunctionInfo(const Function &F) {
return *I->second;
GCStrategy *S = getGCStrategy(F.getGC());
- Functions.push_back(llvm::make_unique<GCFunctionInfo>(F, *S));
+ Functions.push_back(std::make_unique<GCFunctionInfo>(F, *S));
GCFunctionInfo *GFI = Functions.back().get();
FInfoMap[&F] = GFI;
return *GFI;
diff --git a/lib/CodeGen/GCRootLowering.cpp b/lib/CodeGen/GCRootLowering.cpp
index 90571d090bfb..0dc0a5bce747 100644
--- a/lib/CodeGen/GCRootLowering.cpp
+++ b/lib/CodeGen/GCRootLowering.cpp
@@ -249,7 +249,7 @@ GCMachineCodeAnalysis::GCMachineCodeAnalysis() : MachineFunctionPass(ID) {}
void GCMachineCodeAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
MachineFunctionPass::getAnalysisUsage(AU);
AU.setPreservesAll();
- AU.addRequired<MachineModuleInfo>();
+ AU.addRequired<MachineModuleInfoWrapperPass>();
AU.addRequired<GCModuleInfo>();
}
@@ -310,7 +310,7 @@ bool GCMachineCodeAnalysis::runOnMachineFunction(MachineFunction &MF) {
return false;
FI = &getAnalysis<GCModuleInfo>().getFunctionInfo(MF.getFunction());
- MMI = &getAnalysis<MachineModuleInfo>();
+ MMI = &getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
TII = MF.getSubtarget().getInstrInfo();
// Find the size of the stack frame. There may be no correct static frame
diff --git a/lib/CodeGen/GlobalISel/CSEInfo.cpp b/lib/CodeGen/GlobalISel/CSEInfo.cpp
index 4518dbee1a9f..7d9d812d34bc 100644
--- a/lib/CodeGen/GlobalISel/CSEInfo.cpp
+++ b/lib/CodeGen/GlobalISel/CSEInfo.cpp
@@ -52,6 +52,7 @@ bool CSEConfigFull::shouldCSEOpc(unsigned Opc) {
case TargetOpcode::G_ANYEXT:
case TargetOpcode::G_UNMERGE_VALUES:
case TargetOpcode::G_TRUNC:
+ case TargetOpcode::G_GEP:
return true;
}
return false;
@@ -65,9 +66,9 @@ std::unique_ptr<CSEConfigBase>
llvm::getStandardCSEConfigForOpt(CodeGenOpt::Level Level) {
std::unique_ptr<CSEConfigBase> Config;
if (Level == CodeGenOpt::None)
- Config = make_unique<CSEConfigConstantOnly>();
+ Config = std::make_unique<CSEConfigConstantOnly>();
else
- Config = make_unique<CSEConfigFull>();
+ Config = std::make_unique<CSEConfigFull>();
return Config;
}
@@ -332,7 +333,7 @@ GISelInstProfileBuilder::addNodeIDFlag(unsigned Flag) const {
const GISelInstProfileBuilder &GISelInstProfileBuilder::addNodeIDMachineOperand(
const MachineOperand &MO) const {
if (MO.isReg()) {
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!MO.isDef())
addNodeIDRegNum(Reg);
LLT Ty = MRI.getType(Reg);
diff --git a/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
index 461bc6038c2c..51a74793f029 100644
--- a/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
+++ b/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
@@ -162,6 +162,17 @@ MachineInstrBuilder CSEMIRBuilder::buildInstr(unsigned Opc,
return buildConstant(DstOps[0], Cst->getSExtValue());
break;
}
+ case TargetOpcode::G_SEXT_INREG: {
+ assert(DstOps.size() == 1 && "Invalid dst ops");
+ assert(SrcOps.size() == 2 && "Invalid src ops");
+ const DstOp &Dst = DstOps[0];
+ const SrcOp &Src0 = SrcOps[0];
+ const SrcOp &Src1 = SrcOps[1];
+ if (auto MaybeCst =
+ ConstantFoldExtOp(Opc, Src0.getReg(), Src1.getImm(), *getMRI()))
+ return buildConstant(Dst, MaybeCst->getSExtValue());
+ break;
+ }
}
bool CanCopy = checkCopyToDefsPossible(DstOps);
if (!canPerformCSEForOpc(Opc))
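For intuition, a worked instance of the new fold on assumed values (hypothetical
MIR): G_SEXT_INREG sign-extends in place from the given bit width, so

    ; %c:_(s32) = G_CONSTANT i32 10     ; low bits 0b1010
    ; %r:_(s32) = G_SEXT_INREG %c, 4    ; sign bit of the 4-bit field is set
    ; folds to:
    ; %r:_(s32) = G_CONSTANT i32 -6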
diff --git a/lib/CodeGen/GlobalISel/CallLowering.cpp b/lib/CodeGen/GlobalISel/CallLowering.cpp
index a5d8205a34a8..cdad92f7db4f 100644
--- a/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -11,14 +11,16 @@
///
//===----------------------------------------------------------------------===//
-#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#define DEBUG_TYPE "call-lowering"
@@ -32,66 +34,70 @@ bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, ImmutableCallSite CS,
ArrayRef<ArrayRef<Register>> ArgRegs,
Register SwiftErrorVReg,
std::function<unsigned()> GetCalleeReg) const {
+ CallLoweringInfo Info;
auto &DL = CS.getParent()->getParent()->getParent()->getDataLayout();
// First step is to marshall all the function's parameters into the correct
// physregs and memory locations. Gather the sequence of argument types that
// we'll pass to the assigner function.
- SmallVector<ArgInfo, 8> OrigArgs;
unsigned i = 0;
unsigned NumFixedArgs = CS.getFunctionType()->getNumParams();
for (auto &Arg : CS.args()) {
ArgInfo OrigArg{ArgRegs[i], Arg->getType(), ISD::ArgFlagsTy{},
i < NumFixedArgs};
setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, CS);
- // We don't currently support swiftself args.
- if (OrigArg.Flags.isSwiftSelf())
- return false;
- OrigArgs.push_back(OrigArg);
+ Info.OrigArgs.push_back(OrigArg);
++i;
}
- MachineOperand Callee = MachineOperand::CreateImm(0);
if (const Function *F = CS.getCalledFunction())
- Callee = MachineOperand::CreateGA(F, 0);
+ Info.Callee = MachineOperand::CreateGA(F, 0);
else
- Callee = MachineOperand::CreateReg(GetCalleeReg(), false);
-
- ArgInfo OrigRet{ResRegs, CS.getType(), ISD::ArgFlagsTy{}};
- if (!OrigRet.Ty->isVoidTy())
- setArgFlags(OrigRet, AttributeList::ReturnIndex, DL, CS);
-
- return lowerCall(MIRBuilder, CS.getCallingConv(), Callee, OrigRet, OrigArgs,
- SwiftErrorVReg);
+ Info.Callee = MachineOperand::CreateReg(GetCalleeReg(), false);
+
+ Info.OrigRet = ArgInfo{ResRegs, CS.getType(), ISD::ArgFlagsTy{}};
+ if (!Info.OrigRet.Ty->isVoidTy())
+ setArgFlags(Info.OrigRet, AttributeList::ReturnIndex, DL, CS);
+
+ Info.KnownCallees =
+ CS.getInstruction()->getMetadata(LLVMContext::MD_callees);
+ Info.CallConv = CS.getCallingConv();
+ Info.SwiftErrorVReg = SwiftErrorVReg;
+ Info.IsMustTailCall = CS.isMustTailCall();
+ Info.IsTailCall = CS.isTailCall() &&
+ isInTailCallPosition(CS, MIRBuilder.getMF().getTarget());
+ Info.IsVarArg = CS.getFunctionType()->isVarArg();
+ return lowerCall(MIRBuilder, Info);
}
template <typename FuncInfoTy>
void CallLowering::setArgFlags(CallLowering::ArgInfo &Arg, unsigned OpIdx,
const DataLayout &DL,
const FuncInfoTy &FuncInfo) const {
+ auto &Flags = Arg.Flags[0];
const AttributeList &Attrs = FuncInfo.getAttributes();
if (Attrs.hasAttribute(OpIdx, Attribute::ZExt))
- Arg.Flags.setZExt();
+ Flags.setZExt();
if (Attrs.hasAttribute(OpIdx, Attribute::SExt))
- Arg.Flags.setSExt();
+ Flags.setSExt();
if (Attrs.hasAttribute(OpIdx, Attribute::InReg))
- Arg.Flags.setInReg();
+ Flags.setInReg();
if (Attrs.hasAttribute(OpIdx, Attribute::StructRet))
- Arg.Flags.setSRet();
+ Flags.setSRet();
if (Attrs.hasAttribute(OpIdx, Attribute::SwiftSelf))
- Arg.Flags.setSwiftSelf();
+ Flags.setSwiftSelf();
if (Attrs.hasAttribute(OpIdx, Attribute::SwiftError))
- Arg.Flags.setSwiftError();
+ Flags.setSwiftError();
if (Attrs.hasAttribute(OpIdx, Attribute::ByVal))
- Arg.Flags.setByVal();
+ Flags.setByVal();
if (Attrs.hasAttribute(OpIdx, Attribute::InAlloca))
- Arg.Flags.setInAlloca();
+ Flags.setInAlloca();
- if (Arg.Flags.isByVal() || Arg.Flags.isInAlloca()) {
+ if (Flags.isByVal() || Flags.isInAlloca()) {
Type *ElementTy = cast<PointerType>(Arg.Ty)->getElementType();
auto Ty = Attrs.getAttribute(OpIdx, Attribute::ByVal).getValueAsType();
- Arg.Flags.setByValSize(DL.getTypeAllocSize(Ty ? Ty : ElementTy));
+ Flags.setByValSize(DL.getTypeAllocSize(Ty ? Ty : ElementTy));
// For ByVal, alignment should be passed from FE. BE will guess if
// this info is not there but there are cases it cannot get right.
@@ -100,11 +106,11 @@ void CallLowering::setArgFlags(CallLowering::ArgInfo &Arg, unsigned OpIdx,
FrameAlign = FuncInfo.getParamAlignment(OpIdx - 2);
else
FrameAlign = getTLI()->getByValTypeAlignment(ElementTy, DL);
- Arg.Flags.setByValAlign(FrameAlign);
+ Flags.setByValAlign(Align(FrameAlign));
}
if (Attrs.hasAttribute(OpIdx, Attribute::Nest))
- Arg.Flags.setNest();
- Arg.Flags.setOrigAlign(DL.getABITypeAlignment(Arg.Ty));
+ Flags.setNest();
+ Flags.setOrigAlign(Align(DL.getABITypeAlignment(Arg.Ty)));
}
template void
@@ -159,7 +165,7 @@ void CallLowering::unpackRegs(ArrayRef<Register> DstRegs, Register SrcReg,
}
bool CallLowering::handleAssignments(MachineIRBuilder &MIRBuilder,
- ArrayRef<ArgInfo> Args,
+ SmallVectorImpl<ArgInfo> &Args,
ValueHandler &Handler) const {
MachineFunction &MF = MIRBuilder.getMF();
const Function &F = MF.getFunction();
@@ -171,7 +177,7 @@ bool CallLowering::handleAssignments(MachineIRBuilder &MIRBuilder,
bool CallLowering::handleAssignments(CCState &CCInfo,
SmallVectorImpl<CCValAssign> &ArgLocs,
MachineIRBuilder &MIRBuilder,
- ArrayRef<ArgInfo> Args,
+ SmallVectorImpl<ArgInfo> &Args,
ValueHandler &Handler) const {
MachineFunction &MF = MIRBuilder.getMF();
const Function &F = MF.getFunction();
@@ -180,14 +186,99 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
unsigned NumArgs = Args.size();
for (unsigned i = 0; i != NumArgs; ++i) {
MVT CurVT = MVT::getVT(Args[i].Ty);
- if (Handler.assignArg(i, CurVT, CurVT, CCValAssign::Full, Args[i], CCInfo)) {
- // Try to use the register type if we couldn't assign the VT.
- if (!Handler.isArgumentHandler() || !CurVT.isValid())
+ if (Handler.assignArg(i, CurVT, CurVT, CCValAssign::Full, Args[i],
+ Args[i].Flags[0], CCInfo)) {
+ if (!CurVT.isValid())
return false;
- CurVT = TLI->getRegisterTypeForCallingConv(
+ MVT NewVT = TLI->getRegisterTypeForCallingConv(
F.getContext(), F.getCallingConv(), EVT(CurVT));
- if (Handler.assignArg(i, CurVT, CurVT, CCValAssign::Full, Args[i], CCInfo))
- return false;
+
+ // If we need to split the type over multiple regs, check it's a scenario
+ // we currently support.
+ unsigned NumParts = TLI->getNumRegistersForCallingConv(
+ F.getContext(), F.getCallingConv(), CurVT);
+ if (NumParts > 1) {
+ // For now only handle exact splits.
+ if (NewVT.getSizeInBits() * NumParts != CurVT.getSizeInBits())
+ return false;
+ }
+
+ // For incoming arguments (physregs to vregs), we could have values in
+ // physregs (or memlocs) which we want to extract and copy to vregs.
+ // During this, we might have to deal with the LLT being split across
+ // multiple regs, so we have to record this information for later.
+ //
+ // If we have outgoing args, then we have the opposite case. We have a
+ // vreg with an LLT which we want to assign to a physical location, and
+ // we might have to record that the value has to be split later.
+ if (Handler.isIncomingArgumentHandler()) {
+ if (NumParts == 1) {
+ // Try to use the register type if we couldn't assign the VT.
+ if (Handler.assignArg(i, NewVT, NewVT, CCValAssign::Full, Args[i],
+ Args[i].Flags[0], CCInfo))
+ return false;
+ } else {
+ // We're handling an incoming arg which is split over multiple regs.
+ // E.g. passing an s128 on AArch64.
+ ISD::ArgFlagsTy OrigFlags = Args[i].Flags[0];
+ Args[i].OrigRegs.push_back(Args[i].Regs[0]);
+ Args[i].Regs.clear();
+ Args[i].Flags.clear();
+ LLT NewLLT = getLLTForMVT(NewVT);
+ // For each split register, create and assign a vreg that will store
+ // the incoming component of the larger value. These will later be
+ // merged to form the final vreg.
+ for (unsigned Part = 0; Part < NumParts; ++Part) {
+ Register Reg =
+ MIRBuilder.getMRI()->createGenericVirtualRegister(NewLLT);
+ ISD::ArgFlagsTy Flags = OrigFlags;
+ if (Part == 0) {
+ Flags.setSplit();
+ } else {
+ Flags.setOrigAlign(Align::None());
+ if (Part == NumParts - 1)
+ Flags.setSplitEnd();
+ }
+ Args[i].Regs.push_back(Reg);
+ Args[i].Flags.push_back(Flags);
+ if (Handler.assignArg(i + Part, NewVT, NewVT, CCValAssign::Full,
+ Args[i], Args[i].Flags[Part], CCInfo)) {
+ // Still couldn't assign this smaller part type for some reason.
+ return false;
+ }
+ }
+ }
+ } else {
+ // Handling an outgoing arg that might need to be split.
+ if (NumParts < 2)
+ return false; // Don't know how to deal with this type combination.
+
+ // This type is passed via multiple registers in the calling convention.
+ // We need to extract the individual parts.
+ Register LargeReg = Args[i].Regs[0];
+ LLT SmallTy = LLT::scalar(NewVT.getSizeInBits());
+ auto Unmerge = MIRBuilder.buildUnmerge(SmallTy, LargeReg);
+ assert(Unmerge->getNumOperands() == NumParts + 1);
+ ISD::ArgFlagsTy OrigFlags = Args[i].Flags[0];
+ // We're going to replace the regs and flags with the split ones.
+ Args[i].Regs.clear();
+ Args[i].Flags.clear();
+ for (unsigned PartIdx = 0; PartIdx < NumParts; ++PartIdx) {
+ ISD::ArgFlagsTy Flags = OrigFlags;
+ if (PartIdx == 0) {
+ Flags.setSplit();
+ } else {
+ Flags.setOrigAlign(Align::None());
+ if (PartIdx == NumParts - 1)
+ Flags.setSplitEnd();
+ }
+ Args[i].Regs.push_back(Unmerge.getReg(PartIdx));
+ Args[i].Flags.push_back(Flags);
+ if (Handler.assignArg(i + PartIdx, NewVT, NewVT, CCValAssign::Full,
+ Args[i], Args[i].Flags[PartIdx], CCInfo))
+ return false;
+ }
+ }
}
}
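A sketch of the incoming split case under AArch64-like assumptions (vreg names
illustrative): an s128 argument that the calling convention passes in two s64
registers is rewritten from Regs = {%arg:_(s128)} to OrigRegs = {%arg},
Regs = {%lo:_(s64), %hi:_(s64)}, with Split set on the first part's flags and
SplitEnd on the last; once both parts are assigned, the loop below reassembles
them:

    ; %arg:_(s128) = G_MERGE_VALUES %lo:_(s64), %hi:_(s64)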
@@ -202,18 +293,32 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
continue;
}
- assert(Args[i].Regs.size() == 1 &&
- "Can't handle multiple virtual regs yet");
-
// FIXME: Pack registers if we have more than one.
Register ArgReg = Args[i].Regs[0];
+ MVT OrigVT = MVT::getVT(Args[i].Ty);
+ MVT VAVT = VA.getValVT();
if (VA.isRegLoc()) {
- MVT OrigVT = MVT::getVT(Args[i].Ty);
- MVT VAVT = VA.getValVT();
- if (Handler.isArgumentHandler() && VAVT != OrigVT) {
- if (VAVT.getSizeInBits() < OrigVT.getSizeInBits())
- return false; // Can't handle this type of arg yet.
+ if (Handler.isIncomingArgumentHandler() && VAVT != OrigVT) {
+ if (VAVT.getSizeInBits() < OrigVT.getSizeInBits()) {
+ // Expected to be multiple regs for a single incoming arg.
+ unsigned NumArgRegs = Args[i].Regs.size();
+ if (NumArgRegs < 2)
+ return false;
+
+ assert((j + (NumArgRegs - 1)) < ArgLocs.size() &&
+ "Too many regs for number of args");
+ for (unsigned Part = 0; Part < NumArgRegs; ++Part) {
+ // There should be Regs.size() ArgLocs per argument.
+ VA = ArgLocs[j + Part];
+ Handler.assignValueToReg(Args[i].Regs[Part], VA.getLocReg(), VA);
+ }
+ j += NumArgRegs - 1;
+ // Merge the split registers into the expected larger result vreg
+ // of the original call.
+ MIRBuilder.buildMerge(Args[i].OrigRegs[0], Args[i].Regs);
+ continue;
+ }
const LLT VATy(VAVT);
Register NewReg =
MIRBuilder.getMRI()->createGenericVirtualRegister(VATy);
@@ -234,10 +339,28 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
} else {
MIRBuilder.buildTrunc(ArgReg, {NewReg}).getReg(0);
}
+ } else if (!Handler.isIncomingArgumentHandler()) {
+ assert((j + (Args[i].Regs.size() - 1)) < ArgLocs.size() &&
+ "Too many regs for number of args");
+ // This is an outgoing argument that might have been split.
+ for (unsigned Part = 0; Part < Args[i].Regs.size(); ++Part) {
+ // There should be Regs.size() ArgLocs per argument.
+ VA = ArgLocs[j + Part];
+ Handler.assignValueToReg(Args[i].Regs[Part], VA.getLocReg(), VA);
+ }
+ j += Args[i].Regs.size() - 1;
} else {
Handler.assignValueToReg(ArgReg, VA.getLocReg(), VA);
}
} else if (VA.isMemLoc()) {
+ // Don't currently support loading/storing a type that needs to be split
+ // to the stack. Should be easy, just not implemented yet.
+ if (Args[i].Regs.size() > 1) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Load/store a split arg to/from the stack not implemented yet");
+ return false;
+ }
MVT VT = MVT::getVT(Args[i].Ty);
unsigned Size = VT == MVT::iPTR ? DL.getPointerSize()
: alignTo(VT.getSizeInBits(), 8) / 8;
@@ -253,6 +376,81 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
return true;
}
+bool CallLowering::analyzeArgInfo(CCState &CCState,
+ SmallVectorImpl<ArgInfo> &Args,
+ CCAssignFn &AssignFnFixed,
+ CCAssignFn &AssignFnVarArg) const {
+ for (unsigned i = 0, e = Args.size(); i < e; ++i) {
+ MVT VT = MVT::getVT(Args[i].Ty);
+ CCAssignFn &Fn = Args[i].IsFixed ? AssignFnFixed : AssignFnVarArg;
+ if (Fn(i, VT, VT, CCValAssign::Full, Args[i].Flags[0], CCState)) {
+ // Bail out on anything we can't handle.
+ LLVM_DEBUG(dbgs() << "Cannot analyze " << EVT(VT).getEVTString()
+ << " (arg number = " << i << "\n");
+ return false;
+ }
+ }
+ return true;
+}
+
+bool CallLowering::resultsCompatible(CallLoweringInfo &Info,
+ MachineFunction &MF,
+ SmallVectorImpl<ArgInfo> &InArgs,
+ CCAssignFn &CalleeAssignFnFixed,
+ CCAssignFn &CalleeAssignFnVarArg,
+ CCAssignFn &CallerAssignFnFixed,
+ CCAssignFn &CallerAssignFnVarArg) const {
+ const Function &F = MF.getFunction();
+ CallingConv::ID CalleeCC = Info.CallConv;
+ CallingConv::ID CallerCC = F.getCallingConv();
+
+ if (CallerCC == CalleeCC)
+ return true;
+
+ SmallVector<CCValAssign, 16> ArgLocs1;
+ CCState CCInfo1(CalleeCC, false, MF, ArgLocs1, F.getContext());
+ if (!analyzeArgInfo(CCInfo1, InArgs, CalleeAssignFnFixed,
+ CalleeAssignFnVarArg))
+ return false;
+
+ SmallVector<CCValAssign, 16> ArgLocs2;
+ CCState CCInfo2(CallerCC, false, MF, ArgLocs2, F.getContext());
+ if (!analyzeArgInfo(CCInfo2, InArgs, CallerAssignFnFixed,
+ CallerAssignFnVarArg))
+ return false;
+
+ // We need the argument locations to match up exactly. If there's more in
+ // one than the other, then we are done.
+ if (ArgLocs1.size() != ArgLocs2.size())
+ return false;
+
+ // Make sure that each location is passed in exactly the same way.
+ for (unsigned i = 0, e = ArgLocs1.size(); i < e; ++i) {
+ const CCValAssign &Loc1 = ArgLocs1[i];
+ const CCValAssign &Loc2 = ArgLocs2[i];
+
+ // We need both of them to be the same. So if one is a register and one
+ // isn't, we're done.
+ if (Loc1.isRegLoc() != Loc2.isRegLoc())
+ return false;
+
+ if (Loc1.isRegLoc()) {
+ // If they don't have the same register location, we're done.
+ if (Loc1.getLocReg() != Loc2.getLocReg())
+ return false;
+
+ // They matched, so we can move to the next ArgLoc.
+ continue;
+ }
+
+ // Loc1 wasn't a RegLoc, so they both must be MemLocs. Check if they match.
+ if (Loc1.getLocMemOffset() != Loc2.getLocMemOffset())
+ return false;
+ }
+
+ return true;
+}
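A hedged usage sketch from a target's tail-call legality check; the CC_* assign
functions below are placeholders for whatever the target supplies:

    // SmallVector<ArgInfo, 8> InArgs = ...;   // the call's result values
    // if (!resultsCompatible(Info, MF, InArgs,
    //                        CC_Callee_Fixed, CC_Callee_VarArg,
    //                        CC_Caller_Fixed, CC_Caller_VarArg))
    //   return false;   // result locations differ; tail call unsafe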
+
Register CallLowering::ValueHandler::extendRegister(Register ValReg,
CCValAssign &VA) {
LLT LocTy{VA.getLocVT()};
diff --git a/lib/CodeGen/GlobalISel/Combiner.cpp b/lib/CodeGen/GlobalISel/Combiner.cpp
index 31cb1dbbc9b5..b4562a5c6601 100644
--- a/lib/CodeGen/GlobalISel/Combiner.cpp
+++ b/lib/CodeGen/GlobalISel/Combiner.cpp
@@ -27,6 +27,18 @@
using namespace llvm;
+namespace llvm {
+cl::OptionCategory GICombinerOptionCategory(
+ "GlobalISel Combiner",
+ "Control the rules which are enabled. These options all take a comma "
+ "separated list of rules to disable and may be specified by number "
+ "or number range (e.g. 1-10)."
+#ifndef NDEBUG
+ " They may also be specified by name."
+#endif
+);
+} // end namespace llvm
+
namespace {
/// This class acts as the glue that joins the CombinerHelper to the overall
/// Combine algorithm. The CombinerHelper is intended to report the
@@ -92,7 +104,7 @@ bool Combiner::combineMachineInstrs(MachineFunction &MF,
return false;
Builder =
- CSEInfo ? make_unique<CSEMIRBuilder>() : make_unique<MachineIRBuilder>();
+ CSEInfo ? std::make_unique<CSEMIRBuilder>() : std::make_unique<MachineIRBuilder>();
MRI = &MF.getRegInfo();
Builder->setMF(MF);
if (CSEInfo)
diff --git a/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 9cbf3dd83ff1..854769d283f7 100644
--- a/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -8,19 +8,36 @@
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "gi-combiner"
using namespace llvm;
+// Option to allow testing of the combiner while no targets know about indexed
+// addressing.
+static cl::opt<bool>
+ ForceLegalIndexing("force-legal-indexing", cl::Hidden, cl::init(false),
+ cl::desc("Force all indexed operations to be "
+ "legal for the GlobalISel combiner"));
+
+
CombinerHelper::CombinerHelper(GISelChangeObserver &Observer,
- MachineIRBuilder &B)
- : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer) {}
+ MachineIRBuilder &B, GISelKnownBits *KB,
+ MachineDominatorTree *MDT)
+ : Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer),
+ KB(KB), MDT(MDT) {
+ (void)this->KB;
+}
void CombinerHelper::replaceRegWith(MachineRegisterInfo &MRI, Register FromReg,
Register ToReg) const {
@@ -55,8 +72,8 @@ bool CombinerHelper::tryCombineCopy(MachineInstr &MI) {
bool CombinerHelper::matchCombineCopy(MachineInstr &MI) {
if (MI.getOpcode() != TargetOpcode::COPY)
return false;
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
LLT DstTy = MRI.getType(DstReg);
LLT SrcTy = MRI.getType(SrcReg);
// Simple Copy Propagation.
@@ -66,12 +83,183 @@ bool CombinerHelper::matchCombineCopy(MachineInstr &MI) {
return false;
}
void CombinerHelper::applyCombineCopy(MachineInstr &MI) {
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
MI.eraseFromParent();
replaceRegWith(MRI, DstReg, SrcReg);
}
+bool CombinerHelper::tryCombineConcatVectors(MachineInstr &MI) {
+ bool IsUndef = false;
+ SmallVector<Register, 4> Ops;
+ if (matchCombineConcatVectors(MI, IsUndef, Ops)) {
+ applyCombineConcatVectors(MI, IsUndef, Ops);
+ return true;
+ }
+ return false;
+}
+
+bool CombinerHelper::matchCombineConcatVectors(MachineInstr &MI, bool &IsUndef,
+ SmallVectorImpl<Register> &Ops) {
+ assert(MI.getOpcode() == TargetOpcode::G_CONCAT_VECTORS &&
+ "Invalid instruction");
+ IsUndef = true;
+ MachineInstr *Undef = nullptr;
+
+ // Walk over all the operands of concat vectors and check if they are
+ // build_vector themselves or undef.
+ // Then collect their operands in Ops.
+ for (const MachineOperand &MO : MI.operands()) {
+ // Skip the instruction definition.
+ if (MO.isDef())
+ continue;
+ Register Reg = MO.getReg();
+ MachineInstr *Def = MRI.getVRegDef(Reg);
+ assert(Def && "Operand not defined");
+ switch (Def->getOpcode()) {
+ case TargetOpcode::G_BUILD_VECTOR:
+ IsUndef = false;
+ // Remember the operands of the build_vector to fold
+ // them into the yet-to-build flattened concat vectors.
+ for (const MachineOperand &BuildVecMO : Def->operands()) {
+ // Skip the definition.
+ if (BuildVecMO.isDef())
+ continue;
+ Ops.push_back(BuildVecMO.getReg());
+ }
+ break;
+ case TargetOpcode::G_IMPLICIT_DEF: {
+ LLT OpType = MRI.getType(Reg);
+ // Keep one undef value for all the undef operands.
+ if (!Undef) {
+ Builder.setInsertPt(*MI.getParent(), MI);
+ Undef = Builder.buildUndef(OpType.getScalarType());
+ }
+ assert(MRI.getType(Undef->getOperand(0).getReg()) ==
+ OpType.getScalarType() &&
+ "All undefs should have the same type");
+ // Break the undef vector in as many scalar elements as needed
+ // for the flattening.
+ for (unsigned EltIdx = 0, EltEnd = OpType.getNumElements();
+ EltIdx != EltEnd; ++EltIdx)
+ Ops.push_back(Undef->getOperand(0).getReg());
+ break;
+ }
+ default:
+ return false;
+ }
+ }
+ return true;
+}
+void CombinerHelper::applyCombineConcatVectors(
+ MachineInstr &MI, bool IsUndef, const ArrayRef<Register> Ops) {
+ // We determined that the concat_vectors can be flattened.
+ // Generate the flattened build_vector.
+ Register DstReg = MI.getOperand(0).getReg();
+ Builder.setInsertPt(*MI.getParent(), MI);
+ Register NewDstReg = MRI.cloneVirtualRegister(DstReg);
+
+ // Note: IsUndef is sort of redundant. We could have determined it by
+ // checking that all Ops are undef. Alternatively, we could have
+ // generated a build_vector of undefs and relied on another combine to
+ // clean that up. For now, given we already gather this information
+ // in tryCombineConcatVectors, just save compile time and issue the
+ // right thing.
+ if (IsUndef)
+ Builder.buildUndef(NewDstReg);
+ else
+ Builder.buildBuildVector(NewDstReg, Ops);
+ MI.eraseFromParent();
+ replaceRegWith(MRI, DstReg, NewDstReg);
+}
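The flattening in hypothetical MIR; note the single scalar undef shared by
every element of the undef operand:

    ; %a:_(<2 x s32>) = G_BUILD_VECTOR %x, %y
    ; %b:_(<2 x s32>) = G_IMPLICIT_DEF
    ; %d:_(<4 x s32>) = G_CONCAT_VECTORS %a, %b
    ; becomes
    ; %u:_(s32) = G_IMPLICIT_DEF
    ; %d:_(<4 x s32>) = G_BUILD_VECTOR %x, %y, %u, %u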
+
+bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) {
+ SmallVector<Register, 4> Ops;
+ if (matchCombineShuffleVector(MI, Ops)) {
+ applyCombineShuffleVector(MI, Ops);
+ return true;
+ }
+ return false;
+}
+
+bool CombinerHelper::matchCombineShuffleVector(MachineInstr &MI,
+ SmallVectorImpl<Register> &Ops) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
+ "Invalid instruction kind");
+ LLT DstType = MRI.getType(MI.getOperand(0).getReg());
+ Register Src1 = MI.getOperand(1).getReg();
+ LLT SrcType = MRI.getType(Src1);
+ unsigned DstNumElts = DstType.getNumElements();
+ unsigned SrcNumElts = SrcType.getNumElements();
+
+ // If the resulting vector is smaller than the size of the source
+ // vectors being concatenated, we won't be able to replace the
+ // shuffle vector with a concat_vectors.
+ //
+ // Note: We may still be able to produce a concat_vectors fed by
+ // extract_vector_elt and so on. It is less clear that would
+ // be better though, so don't bother for now.
+ if (DstNumElts < 2 * SrcNumElts)
+ return false;
+
+ // Check that the shuffle mask can be broken evenly between the
+ // different sources.
+ if (DstNumElts % SrcNumElts != 0)
+ return false;
+
+ // Mask length is a multiple of the source vector length.
+ // Check if the shuffle is some kind of concatenation of the input
+ // vectors.
+ unsigned NumConcat = DstNumElts / SrcNumElts;
+ SmallVector<int, 8> ConcatSrcs(NumConcat, -1);
+ SmallVector<int, 8> Mask;
+ ShuffleVectorInst::getShuffleMask(MI.getOperand(3).getShuffleMask(), Mask);
+ for (unsigned i = 0; i != DstNumElts; ++i) {
+ int Idx = Mask[i];
+ // Undef value.
+ if (Idx < 0)
+ continue;
+ // Ensure the indices in each SrcType sized piece are sequential and that
+ // the same source is used for the whole piece.
+ if ((Idx % SrcNumElts != (i % SrcNumElts)) ||
+ (ConcatSrcs[i / SrcNumElts] >= 0 &&
+ ConcatSrcs[i / SrcNumElts] != (int)(Idx / SrcNumElts)))
+ return false;
+ // Remember which source this index came from.
+ ConcatSrcs[i / SrcNumElts] = Idx / SrcNumElts;
+ }
+
+ // The shuffle is concatenating multiple vectors together.
+ // Collect the different operands for that.
+ Register UndefReg;
+ Register Src2 = MI.getOperand(2).getReg();
+ for (auto Src : ConcatSrcs) {
+ if (Src < 0) {
+ if (!UndefReg) {
+ Builder.setInsertPt(*MI.getParent(), MI);
+ UndefReg = Builder.buildUndef(SrcType).getReg(0);
+ }
+ Ops.push_back(UndefReg);
+ } else if (Src == 0)
+ Ops.push_back(Src1);
+ else
+ Ops.push_back(Src2);
+ }
+ return true;
+}
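An example mask that matches, in hypothetical MIR:

    ; %d:_(<4 x s32>) = G_SHUFFLE_VECTOR %v1:_(<2 x s32>), %v2, shufflemask(0,1,2,3)
    ; elements 0-1 select all of %v1 (source 0) and 2-3 all of %v2 (source 1),
    ; so ConcatSrcs = {0, 1} and Ops = {%v1, %v2}. A piece that is entirely
    ; undef in the mask would contribute a G_IMPLICIT_DEF source instead.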
+
+void CombinerHelper::applyCombineShuffleVector(MachineInstr &MI,
+ const ArrayRef<Register> Ops) {
+ Register DstReg = MI.getOperand(0).getReg();
+ Builder.setInsertPt(*MI.getParent(), MI);
+ Register NewDstReg = MRI.cloneVirtualRegister(DstReg);
+
+ Builder.buildConcatVectors(NewDstReg, Ops);
+
+ MI.eraseFromParent();
+ replaceRegWith(MRI, DstReg, NewDstReg);
+}
+
namespace {
/// Select a preference between two uses. CurrentUse is the current preference
@@ -279,7 +467,7 @@ void CombinerHelper::applyCombineExtendingLoads(MachineInstr &MI,
// up the type and extend so that it uses the preferred use.
if (UseMI->getOpcode() == Preferred.ExtendOpcode ||
UseMI->getOpcode() == TargetOpcode::G_ANYEXT) {
- unsigned UseDstReg = UseMI->getOperand(0).getReg();
+ Register UseDstReg = UseMI->getOperand(0).getReg();
MachineOperand &UseSrcMO = UseMI->getOperand(1);
const LLT &UseDstTy = MRI.getType(UseDstReg);
if (UseDstReg != ChosenDstReg) {
@@ -342,8 +530,212 @@ void CombinerHelper::applyCombineExtendingLoads(MachineInstr &MI,
Observer.changedInstr(MI);
}
-bool CombinerHelper::matchCombineBr(MachineInstr &MI) {
- assert(MI.getOpcode() == TargetOpcode::G_BR && "Expected a G_BR");
+bool CombinerHelper::isPredecessor(MachineInstr &DefMI, MachineInstr &UseMI) {
+ assert(DefMI.getParent() == UseMI.getParent());
+ if (&DefMI == &UseMI)
+ return false;
+
+ // Walk the block until we find whichever of the two instructions comes
+ // first; DefMI is a predecessor of UseMI iff we reach DefMI first.
+ for (const MachineInstr &CurMI : *DefMI.getParent()) {
+ if (&CurMI == &DefMI)
+ return true;
+ if (&CurMI == &UseMI)
+ return false;
+ }
+
+ llvm_unreachable("Block must contain both DefMI and UseMI");
+}
+
+bool CombinerHelper::dominates(MachineInstr &DefMI, MachineInstr &UseMI) {
+ if (MDT)
+ return MDT->dominates(&DefMI, &UseMI);
+ else if (DefMI.getParent() != UseMI.getParent())
+ return false;
+
+ return isPredecessor(DefMI, UseMI);
+}
+
+bool CombinerHelper::findPostIndexCandidate(MachineInstr &MI, Register &Addr,
+ Register &Base, Register &Offset) {
+ auto &MF = *MI.getParent()->getParent();
+ const auto &TLI = *MF.getSubtarget().getTargetLowering();
+
+#ifndef NDEBUG
+ unsigned Opcode = MI.getOpcode();
+ assert(Opcode == TargetOpcode::G_LOAD || Opcode == TargetOpcode::G_SEXTLOAD ||
+ Opcode == TargetOpcode::G_ZEXTLOAD || Opcode == TargetOpcode::G_STORE);
+#endif
+
+ Base = MI.getOperand(1).getReg();
+ MachineInstr *BaseDef = MRI.getUniqueVRegDef(Base);
+ if (BaseDef && BaseDef->getOpcode() == TargetOpcode::G_FRAME_INDEX)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Searching for post-indexing opportunity for: " << MI);
+
+ for (auto &Use : MRI.use_instructions(Base)) {
+ if (Use.getOpcode() != TargetOpcode::G_GEP)
+ continue;
+
+ Offset = Use.getOperand(2).getReg();
+ if (!ForceLegalIndexing &&
+ !TLI.isIndexingLegal(MI, Base, Offset, /*IsPre*/ false, MRI)) {
+ LLVM_DEBUG(dbgs() << " Ignoring candidate with illegal addrmode: "
+ << Use);
+ continue;
+ }
+
+ // Make sure the offset calculation is before the potentially indexed op.
+ // FIXME: we really care about dependency here. The offset calculation might
+ // be movable.
+ MachineInstr *OffsetDef = MRI.getUniqueVRegDef(Offset);
+ if (!OffsetDef || !dominates(*OffsetDef, MI)) {
+ LLVM_DEBUG(dbgs() << " Ignoring candidate with offset after mem-op: "
+ << Use);
+ continue;
+ }
+
+ // FIXME: check whether all uses of Base are load/store with foldable
+ // addressing modes. If so, using the normal addr-modes is better than
+ // forming an indexed one.
+
+ bool MemOpDominatesAddrUses = true;
+ for (auto &GEPUse : MRI.use_instructions(Use.getOperand(0).getReg())) {
+ if (!dominates(MI, GEPUse)) {
+ MemOpDominatesAddrUses = false;
+ break;
+ }
+ }
+
+ if (!MemOpDominatesAddrUses) {
+ LLVM_DEBUG(
+ dbgs() << " Ignoring candidate as memop does not dominate uses: "
+ << Use);
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << " Found match: " << Use);
+ Addr = Use.getOperand(0).getReg();
+ return true;
+ }
+
+ return false;
+}
+
+bool CombinerHelper::findPreIndexCandidate(MachineInstr &MI, Register &Addr,
+ Register &Base, Register &Offset) {
+ auto &MF = *MI.getParent()->getParent();
+ const auto &TLI = *MF.getSubtarget().getTargetLowering();
+
+#ifndef NDEBUG
+ unsigned Opcode = MI.getOpcode();
+ assert(Opcode == TargetOpcode::G_LOAD || Opcode == TargetOpcode::G_SEXTLOAD ||
+ Opcode == TargetOpcode::G_ZEXTLOAD || Opcode == TargetOpcode::G_STORE);
+#endif
+
+ Addr = MI.getOperand(1).getReg();
+ MachineInstr *AddrDef = getOpcodeDef(TargetOpcode::G_GEP, Addr, MRI);
+ if (!AddrDef || MRI.hasOneUse(Addr))
+ return false;
+
+ Base = AddrDef->getOperand(1).getReg();
+ Offset = AddrDef->getOperand(2).getReg();
+
+ LLVM_DEBUG(dbgs() << "Found potential pre-indexed load_store: " << MI);
+
+ if (!ForceLegalIndexing &&
+ !TLI.isIndexingLegal(MI, Base, Offset, /*IsPre*/ true, MRI)) {
+ LLVM_DEBUG(dbgs() << " Skipping, not legal for target");
+ return false;
+ }
+
+ MachineInstr *BaseDef = getDefIgnoringCopies(Base, MRI);
+ if (BaseDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
+ LLVM_DEBUG(dbgs() << " Skipping, frame index would need copy anyway.");
+ return false;
+ }
+
+ if (MI.getOpcode() == TargetOpcode::G_STORE) {
+ // Would require a copy.
+ if (Base == MI.getOperand(0).getReg()) {
+ LLVM_DEBUG(dbgs() << " Skipping, storing base so need copy anyway.");
+ return false;
+ }
+
+ // We're expecting one use of Addr in MI, but it could also be the
+ // value stored, which isn't actually dominated by the instruction.
+ if (MI.getOperand(0).getReg() == Addr) {
+ LLVM_DEBUG(dbgs() << " Skipping, does not dominate all addr uses");
+ return false;
+ }
+ }
+
+ // FIXME: check whether all uses of the base pointer are constant GEPs. That
+ // might allow us to end base's liveness here by adjusting the constant.
+
+ for (auto &UseMI : MRI.use_instructions(Addr)) {
+ if (!dominates(MI, UseMI)) {
+ LLVM_DEBUG(dbgs() << " Skipping, does not dominate all addr uses.");
+ return false;
+ }
+ }
+
+ return true;
+}
+
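+// A sketch of the transformation this performs, in the pre-indexed case
+// (virtual register names are illustrative only):
+//   %offset:_(s64) = G_CONSTANT i64 8
+//   %addr:_(p0) = G_GEP %base, %offset
+//   %val:_(s64) = G_LOAD %addr
+// becomes, when the target reports the addressing mode as legal:
+//   %val:_(s64), %addr:_(p0) = G_INDEXED_LOAD %base, %offset, 1
+// The trailing immediate distinguishes pre-indexing (1) from post-indexing
+// (0).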
+bool CombinerHelper::tryCombineIndexedLoadStore(MachineInstr &MI) {
+ unsigned Opcode = MI.getOpcode();
+ if (Opcode != TargetOpcode::G_LOAD && Opcode != TargetOpcode::G_SEXTLOAD &&
+ Opcode != TargetOpcode::G_ZEXTLOAD && Opcode != TargetOpcode::G_STORE)
+ return false;
+
+ bool IsStore = Opcode == TargetOpcode::G_STORE;
+ Register Addr, Base, Offset;
+ bool IsPre = findPreIndexCandidate(MI, Addr, Base, Offset);
+ if (!IsPre && !findPostIndexCandidate(MI, Addr, Base, Offset))
+ return false;
+
+ unsigned NewOpcode;
+ switch (Opcode) {
+ case TargetOpcode::G_LOAD:
+ NewOpcode = TargetOpcode::G_INDEXED_LOAD;
+ break;
+ case TargetOpcode::G_SEXTLOAD:
+ NewOpcode = TargetOpcode::G_INDEXED_SEXTLOAD;
+ break;
+ case TargetOpcode::G_ZEXTLOAD:
+ NewOpcode = TargetOpcode::G_INDEXED_ZEXTLOAD;
+ break;
+ case TargetOpcode::G_STORE:
+ NewOpcode = TargetOpcode::G_INDEXED_STORE;
+ break;
+ default:
+ llvm_unreachable("Unknown load/store opcode");
+ }
+
+ MachineInstr &AddrDef = *MRI.getUniqueVRegDef(Addr);
+ MachineIRBuilder MIRBuilder(MI);
+ auto MIB = MIRBuilder.buildInstr(NewOpcode);
+ if (IsStore) {
+ MIB.addDef(Addr);
+ MIB.addUse(MI.getOperand(0).getReg());
+ } else {
+ MIB.addDef(MI.getOperand(0).getReg());
+ MIB.addDef(Addr);
+ }
+
+ MIB.addUse(Base);
+ MIB.addUse(Offset);
+ MIB.addImm(IsPre);
+ MI.eraseFromParent();
+ AddrDef.eraseFromParent();
+
+  LLVM_DEBUG(dbgs() << "  Combined to indexed operation");
+ return true;
+}
+
+bool CombinerHelper::matchElideBrByInvertingCond(MachineInstr &MI) {
+ if (MI.getOpcode() != TargetOpcode::G_BR)
+ return false;
+
// Try to match the following:
// bb1:
// %c(s32) = G_ICMP pred, %a, %b
@@ -380,9 +772,14 @@ bool CombinerHelper::matchCombineBr(MachineInstr &MI) {
return true;
}
-bool CombinerHelper::tryCombineBr(MachineInstr &MI) {
- if (!matchCombineBr(MI))
+bool CombinerHelper::tryElideBrByInvertingCond(MachineInstr &MI) {
+ if (!matchElideBrByInvertingCond(MI))
return false;
+ applyElideBrByInvertingCond(MI);
+ return true;
+}
+
+void CombinerHelper::applyElideBrByInvertingCond(MachineInstr &MI) {
MachineBasicBlock *BrTarget = MI.getOperand(0).getMBB();
MachineBasicBlock::iterator BrIt(MI);
MachineInstr *BrCond = &*std::prev(BrIt);
@@ -401,11 +798,509 @@ bool CombinerHelper::tryCombineBr(MachineInstr &MI) {
BrCond->getOperand(1).setMBB(BrTarget);
Observer.changedInstr(*BrCond);
MI.eraseFromParent();
+}
+
+static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
+ // On Darwin, -Os means optimize for size without hurting performance, so
+ // only really optimize for size when -Oz (MinSize) is used.
+ if (MF.getTarget().getTargetTriple().isOSDarwin())
+ return MF.getFunction().hasMinSize();
+ return MF.getFunction().hasOptSize();
+}
+
+// Returns a list of types to use for memory op lowering in MemOps. A partial
+// port of findOptimalMemOpLowering in TargetLowering.
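+// For example, a 13-byte copy on a target whose widest scalar is s64 would
+// typically produce {s64, s32, s8}, or an overlapping {s64, s64} pair when
+// misaligned accesses are fast and overlap is allowed.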
+static bool findGISelOptimalMemOpLowering(
+ std::vector<LLT> &MemOps, unsigned Limit, uint64_t Size, unsigned DstAlign,
+ unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+ bool AllowOverlap, unsigned DstAS, unsigned SrcAS,
+ const AttributeList &FuncAttributes, const TargetLowering &TLI) {
+ // If 'SrcAlign' is zero, that means the memory operation does not need to
+ // load the value, i.e. memset or memcpy from constant string. Otherwise,
+ // it's the inferred alignment of the source. 'DstAlign', on the other hand,
+ // is the specified alignment of the memory operation. If it is zero, that
+ // means it's possible to change the alignment of the destination.
+ // 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
+ // not need to be loaded.
+ if (SrcAlign != 0 && SrcAlign < DstAlign)
+ return false;
+
+ LLT Ty = TLI.getOptimalMemOpLLT(Size, DstAlign, SrcAlign, IsMemset,
+ ZeroMemset, MemcpyStrSrc, FuncAttributes);
+
+ if (Ty == LLT()) {
+ // Use the largest scalar type whose alignment constraints are satisfied.
+ // We only need to check DstAlign here as SrcAlign is always greater or
+ // equal to DstAlign (or zero).
+ Ty = LLT::scalar(64);
+ while (DstAlign && DstAlign < Ty.getSizeInBytes() &&
+ !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, DstAlign))
+ Ty = LLT::scalar(Ty.getSizeInBytes());
+ assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
+ // FIXME: check for the largest legal type we can load/store to.
+ }
+
+ unsigned NumMemOps = 0;
+ while (Size != 0) {
+ unsigned TySize = Ty.getSizeInBytes();
+ while (TySize > Size) {
+ // For now, only use non-vector load / store's for the left-over pieces.
+ LLT NewTy = Ty;
+ // FIXME: check for mem op safety and legality of the types. Not all of
+ // SDAGisms map cleanly to GISel concepts.
+ if (NewTy.isVector())
+ NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32);
+ NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits() - 1));
+ unsigned NewTySize = NewTy.getSizeInBytes();
+ assert(NewTySize > 0 && "Could not find appropriate type");
+
+ // If the new LLT cannot cover all of the remaining bits, then consider
+ // issuing a (or a pair of) unaligned and overlapping load / store.
+ bool Fast;
+      // Need to get a VT equivalent for allowsMisalignedMemoryAccesses().
+ MVT VT = getMVTForLLT(Ty);
+ if (NumMemOps && AllowOverlap && NewTySize < Size &&
+ TLI.allowsMisalignedMemoryAccesses(
+ VT, DstAS, DstAlign, MachineMemOperand::MONone, &Fast) &&
+ Fast)
+ TySize = Size;
+ else {
+ Ty = NewTy;
+ TySize = NewTySize;
+ }
+ }
+
+ if (++NumMemOps > Limit)
+ return false;
+
+ MemOps.push_back(Ty);
+ Size -= TySize;
+ }
+
+ return true;
+}
+
+static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
+ if (Ty.isVector())
+ return VectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
+ Ty.getNumElements());
+ return IntegerType::get(C, Ty.getSizeInBits());
+}
+
+// Get a vectorized representation of the memset value operand, GISel edition.
+static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+ unsigned NumBits = Ty.getScalarSizeInBits();
+ auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
+ if (!Ty.isVector() && ValVRegAndVal) {
+ unsigned KnownVal = ValVRegAndVal->Value;
+ APInt Scalar = APInt(8, KnownVal);
+ APInt SplatVal = APInt::getSplat(NumBits, Scalar);
+ return MIB.buildConstant(Ty, SplatVal).getReg(0);
+ }
+ // FIXME: for vector types create a G_BUILD_VECTOR.
+ if (Ty.isVector())
+ return Register();
+
+ // Extend the byte value to the larger type, and then multiply by a magic
+ // value 0x010101... in order to replicate it across every byte.
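+  // For example, an s8 value 0xAB zero-extended to s32 gives 0x000000AB, and
+  // 0x000000AB * 0x01010101 == 0xABABABAB.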
+ LLT ExtType = Ty.getScalarType();
+ auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
+ if (NumBits > 8) {
+ APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
+ auto MagicMI = MIB.buildConstant(ExtType, Magic);
+ Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
+ }
+
+ assert(ExtType == Ty && "Vector memset value type not supported yet");
+ return Val;
+}
+
+bool CombinerHelper::optimizeMemset(MachineInstr &MI, Register Dst,
+                                    Register Val, unsigned KnownLen,
+                                    unsigned Align, bool IsVolatile) {
+ auto &MF = *MI.getParent()->getParent();
+ const auto &TLI = *MF.getSubtarget().getTargetLowering();
+ auto &DL = MF.getDataLayout();
+ LLVMContext &C = MF.getFunction().getContext();
+
+ assert(KnownLen != 0 && "Have a zero length memset length!");
+
+ bool DstAlignCanChange = false;
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ bool OptSize = shouldLowerMemFuncForSize(MF);
+
+ MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
+ if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
+ DstAlignCanChange = true;
+
+ unsigned Limit = TLI.getMaxStoresPerMemset(OptSize);
+ std::vector<LLT> MemOps;
+
+ const auto &DstMMO = **MI.memoperands_begin();
+ MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
+
+ auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
+ bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
+
+ if (!findGISelOptimalMemOpLowering(
+ MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Align), 0,
+ /*IsMemset=*/true,
+ /*ZeroMemset=*/IsZeroVal, /*MemcpyStrSrc=*/false,
+ /*AllowOverlap=*/!IsVolatile, DstPtrInfo.getAddrSpace(), ~0u,
+ MF.getFunction().getAttributes(), TLI))
+ return false;
+
+ if (DstAlignCanChange) {
+ // Get an estimate of the type from the LLT.
+ Type *IRTy = getTypeForLLT(MemOps[0], C);
+ unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);
+ if (NewAlign > Align) {
+ Align = NewAlign;
+ unsigned FI = FIDef->getOperand(1).getIndex();
+ // Give the stack frame object a larger alignment if needed.
+ if (MFI.getObjectAlignment(FI) < Align)
+ MFI.setObjectAlignment(FI, Align);
+ }
+ }
+
+ MachineIRBuilder MIB(MI);
+ // Find the largest store and generate the bit pattern for it.
+ LLT LargestTy = MemOps[0];
+ for (unsigned i = 1; i < MemOps.size(); i++)
+ if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits())
+ LargestTy = MemOps[i];
+
+ // The memset stored value is always defined as an s8, so in order to make it
+ // work with larger store types we need to repeat the bit pattern across the
+ // wider type.
+ Register MemSetValue = getMemsetValue(Val, LargestTy, MIB);
+
+ if (!MemSetValue)
+ return false;
+
+ // Generate the stores. For each store type in the list, we generate the
+ // matching store of that type to the destination address.
+ LLT PtrTy = MRI.getType(Dst);
+ unsigned DstOff = 0;
+ unsigned Size = KnownLen;
+ for (unsigned I = 0; I < MemOps.size(); I++) {
+ LLT Ty = MemOps[I];
+ unsigned TySize = Ty.getSizeInBytes();
+ if (TySize > Size) {
+ // Issuing an unaligned load / store pair that overlaps with the previous
+ // pair. Adjust the offset accordingly.
+ assert(I == MemOps.size() - 1 && I != 0);
+ DstOff -= TySize - Size;
+ }
+
+    // If this store is smaller than the largest store, see whether we can get
+ // the smaller value for free with a truncate.
+ Register Value = MemSetValue;
+ if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) {
+ MVT VT = getMVTForLLT(Ty);
+ MVT LargestVT = getMVTForLLT(LargestTy);
+ if (!LargestTy.isVector() && !Ty.isVector() &&
+ TLI.isTruncateFree(LargestVT, VT))
+ Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0);
+ else
+ Value = getMemsetValue(Val, Ty, MIB);
+ if (!Value)
+ return false;
+ }
+
+ auto *StoreMMO =
+ MF.getMachineMemOperand(&DstMMO, DstOff, Ty.getSizeInBytes());
+
+ Register Ptr = Dst;
+ if (DstOff != 0) {
+ auto Offset =
+ MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff);
+ Ptr = MIB.buildGEP(PtrTy, Dst, Offset).getReg(0);
+ }
+
+ MIB.buildStore(Value, Ptr, *StoreMMO);
+ DstOff += Ty.getSizeInBytes();
+ Size -= TySize;
+ }
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool CombinerHelper::optimizeMemcpy(MachineInstr &MI, Register Dst,
+ Register Src, unsigned KnownLen,
+ unsigned DstAlign, unsigned SrcAlign,
+ bool IsVolatile) {
+ auto &MF = *MI.getParent()->getParent();
+ const auto &TLI = *MF.getSubtarget().getTargetLowering();
+ auto &DL = MF.getDataLayout();
+ LLVMContext &C = MF.getFunction().getContext();
+
+ assert(KnownLen != 0 && "Have a zero length memcpy length!");
+
+ bool DstAlignCanChange = false;
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ bool OptSize = shouldLowerMemFuncForSize(MF);
+ unsigned Alignment = MinAlign(DstAlign, SrcAlign);
+
+ MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
+ if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
+ DstAlignCanChange = true;
+
+ // FIXME: infer better src pointer alignment like SelectionDAG does here.
+ // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining
+ // if the memcpy is in a tail call position.
+
+ unsigned Limit = TLI.getMaxStoresPerMemcpy(OptSize);
+ std::vector<LLT> MemOps;
+
+ const auto &DstMMO = **MI.memoperands_begin();
+ const auto &SrcMMO = **std::next(MI.memoperands_begin());
+ MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
+ MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
+
+ if (!findGISelOptimalMemOpLowering(
+ MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Alignment),
+ SrcAlign,
+ /*IsMemset=*/false,
+ /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false,
+ /*AllowOverlap=*/!IsVolatile, DstPtrInfo.getAddrSpace(),
+ SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes(), TLI))
+ return false;
+
+ if (DstAlignCanChange) {
+ // Get an estimate of the type from the LLT.
+ Type *IRTy = getTypeForLLT(MemOps[0], C);
+ unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);
+
+ // Don't promote to an alignment that would require dynamic stack
+ // realignment.
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ if (!TRI->needsStackRealignment(MF))
+ while (NewAlign > Alignment &&
+ DL.exceedsNaturalStackAlignment(Align(NewAlign)))
+ NewAlign /= 2;
+
+ if (NewAlign > Alignment) {
+ Alignment = NewAlign;
+ unsigned FI = FIDef->getOperand(1).getIndex();
+ // Give the stack frame object a larger alignment if needed.
+ if (MFI.getObjectAlignment(FI) < Alignment)
+ MFI.setObjectAlignment(FI, Alignment);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n");
+
+ MachineIRBuilder MIB(MI);
+  // Now we need to emit a load/store pair for each of the types we've
+  // collected. I.e. for each type, generate a load of that width from the
+  // source pointer, then a corresponding store of the loaded value to the
+  // dest buffer. This can result in a sequence of loads and stores of mixed
+  // types, depending on what the target specifies as good types to use.
+ unsigned CurrOffset = 0;
+ LLT PtrTy = MRI.getType(Src);
+ unsigned Size = KnownLen;
+ for (auto CopyTy : MemOps) {
+ // Issuing an unaligned load / store pair that overlaps with the previous
+ // pair. Adjust the offset accordingly.
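+    // E.g. a 10-byte copy lowered as {s64, s64} issues the second pair at
+    // offset 2 rather than 8, re-copying bytes 2..7.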
+ if (CopyTy.getSizeInBytes() > Size)
+ CurrOffset -= CopyTy.getSizeInBytes() - Size;
+
+ // Construct MMOs for the accesses.
+ auto *LoadMMO =
+ MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
+ auto *StoreMMO =
+ MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
+
+ // Create the load.
+ Register LoadPtr = Src;
+ Register Offset;
+ if (CurrOffset != 0) {
+ Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset)
+ .getReg(0);
+ LoadPtr = MIB.buildGEP(PtrTy, Src, Offset).getReg(0);
+ }
+ auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO);
+
+ // Create the store.
+ Register StorePtr =
+ CurrOffset == 0 ? Dst : MIB.buildGEP(PtrTy, Dst, Offset).getReg(0);
+ MIB.buildStore(LdVal, StorePtr, *StoreMMO);
+ CurrOffset += CopyTy.getSizeInBytes();
+ Size -= CopyTy.getSizeInBytes();
+ }
+
+ MI.eraseFromParent();
return true;
}
+bool CombinerHelper::optimizeMemmove(MachineInstr &MI, Register Dst,
+ Register Src, unsigned KnownLen,
+ unsigned DstAlign, unsigned SrcAlign,
+ bool IsVolatile) {
+ auto &MF = *MI.getParent()->getParent();
+ const auto &TLI = *MF.getSubtarget().getTargetLowering();
+ auto &DL = MF.getDataLayout();
+ LLVMContext &C = MF.getFunction().getContext();
+
+ assert(KnownLen != 0 && "Have a zero length memmove length!");
+
+ bool DstAlignCanChange = false;
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ bool OptSize = shouldLowerMemFuncForSize(MF);
+ unsigned Alignment = MinAlign(DstAlign, SrcAlign);
+
+ MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
+ if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
+ DstAlignCanChange = true;
+
+ unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize);
+ std::vector<LLT> MemOps;
+
+ const auto &DstMMO = **MI.memoperands_begin();
+ const auto &SrcMMO = **std::next(MI.memoperands_begin());
+ MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
+ MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
+
+ // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
+  // to a bug in its findOptimalMemOpLowering implementation. For now do the
+ // same thing here.
+ if (!findGISelOptimalMemOpLowering(
+ MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Alignment),
+ SrcAlign,
+ /*IsMemset=*/false,
+ /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false,
+ /*AllowOverlap=*/false, DstPtrInfo.getAddrSpace(),
+ SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes(), TLI))
+ return false;
+
+ if (DstAlignCanChange) {
+ // Get an estimate of the type from the LLT.
+ Type *IRTy = getTypeForLLT(MemOps[0], C);
+ unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);
+
+ // Don't promote to an alignment that would require dynamic stack
+ // realignment.
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ if (!TRI->needsStackRealignment(MF))
+ while (NewAlign > Alignment &&
+ DL.exceedsNaturalStackAlignment(Align(NewAlign)))
+ NewAlign /= 2;
+
+ if (NewAlign > Alignment) {
+ Alignment = NewAlign;
+ unsigned FI = FIDef->getOperand(1).getIndex();
+ // Give the stack frame object a larger alignment if needed.
+ if (MFI.getObjectAlignment(FI) < Alignment)
+ MFI.setObjectAlignment(FI, Alignment);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n");
+
+ MachineIRBuilder MIB(MI);
+ // Memmove requires that we perform the loads first before issuing the stores.
+ // Apart from that, this loop is pretty much doing the same thing as the
+ // memcpy codegen function.
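+  // If the two ranges overlap, e.g. memmove(p + 1, p, 8), interleaving the
+  // stores with the loads could clobber bytes that still need to be read.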
+ unsigned CurrOffset = 0;
+ LLT PtrTy = MRI.getType(Src);
+ SmallVector<Register, 16> LoadVals;
+ for (auto CopyTy : MemOps) {
+ // Construct MMO for the load.
+ auto *LoadMMO =
+ MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
+
+ // Create the load.
+ Register LoadPtr = Src;
+ if (CurrOffset != 0) {
+ auto Offset =
+ MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
+ LoadPtr = MIB.buildGEP(PtrTy, Src, Offset).getReg(0);
+ }
+ LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0));
+ CurrOffset += CopyTy.getSizeInBytes();
+ }
+
+ CurrOffset = 0;
+ for (unsigned I = 0; I < MemOps.size(); ++I) {
+ LLT CopyTy = MemOps[I];
+ // Now store the values loaded.
+ auto *StoreMMO =
+ MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
+
+ Register StorePtr = Dst;
+ if (CurrOffset != 0) {
+ auto Offset =
+ MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset);
+ StorePtr = MIB.buildGEP(PtrTy, Dst, Offset).getReg(0);
+ }
+ MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO);
+ CurrOffset += CopyTy.getSizeInBytes();
+ }
+ MI.eraseFromParent();
+ return true;
+}
+
+bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
+ // This combine is fairly complex so it's not written with a separate
+ // matcher function.
+ assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
+ Intrinsic::ID ID = (Intrinsic::ID)MI.getIntrinsicID();
+ assert((ID == Intrinsic::memcpy || ID == Intrinsic::memmove ||
+ ID == Intrinsic::memset) &&
+ "Expected a memcpy like intrinsic");
+
+ auto MMOIt = MI.memoperands_begin();
+ const MachineMemOperand *MemOp = *MMOIt;
+ bool IsVolatile = MemOp->isVolatile();
+ // Don't try to optimize volatile.
+ if (IsVolatile)
+ return false;
+
+ unsigned DstAlign = MemOp->getBaseAlignment();
+ unsigned SrcAlign = 0;
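+  // Operand 0 is the intrinsic ID; the destination, source (or memset value)
+  // and length registers follow as operands 1 to 3.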
+ Register Dst = MI.getOperand(1).getReg();
+ Register Src = MI.getOperand(2).getReg();
+ Register Len = MI.getOperand(3).getReg();
+
+ if (ID != Intrinsic::memset) {
+    assert(std::next(MMOIt) != MI.memoperands_end() &&
+           "Expected a second MMO on MI");
+ MemOp = *(++MMOIt);
+ SrcAlign = MemOp->getBaseAlignment();
+ }
+
+ // See if this is a constant length copy
+ auto LenVRegAndVal = getConstantVRegValWithLookThrough(Len, MRI);
+ if (!LenVRegAndVal)
+ return false; // Leave it to the legalizer to lower it to a libcall.
+ unsigned KnownLen = LenVRegAndVal->Value;
+
+ if (KnownLen == 0) {
+ MI.eraseFromParent();
+ return true;
+ }
+
+ if (MaxLen && KnownLen > MaxLen)
+ return false;
+
+  if (ID == Intrinsic::memcpy)
+    return optimizeMemcpy(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
+                          IsVolatile);
+  if (ID == Intrinsic::memmove)
+    return optimizeMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign,
+                           IsVolatile);
+  if (ID == Intrinsic::memset)
+    return optimizeMemset(MI, Dst, Src, KnownLen, DstAlign, IsVolatile);
+ return false;
+}
+
bool CombinerHelper::tryCombine(MachineInstr &MI) {
if (tryCombineCopy(MI))
return true;
- return tryCombineExtendingLoads(MI);
+ if (tryCombineExtendingLoads(MI))
+ return true;
+ if (tryCombineIndexedLoadStore(MI))
+ return true;
+ return false;
}
diff --git a/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
new file mode 100644
index 000000000000..be8efa8795f3
--- /dev/null
+++ b/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
@@ -0,0 +1,383 @@
+//===- lib/CodeGen/GlobalISel/GISelKnownBits.cpp ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// Provides analysis for querying information about KnownBits during GISel
+/// passes.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+
+#define DEBUG_TYPE "gisel-known-bits"
+
+using namespace llvm;
+
+char llvm::GISelKnownBitsAnalysis::ID = 0;
+
+INITIALIZE_PASS_BEGIN(GISelKnownBitsAnalysis, DEBUG_TYPE,
+                      "Analysis for Computing Known Bits", false, true)
+INITIALIZE_PASS_END(GISelKnownBitsAnalysis, DEBUG_TYPE,
+                    "Analysis for Computing Known Bits", false, true)
+
+GISelKnownBits::GISelKnownBits(MachineFunction &MF)
+ : MF(MF), MRI(MF.getRegInfo()), TL(*MF.getSubtarget().getTargetLowering()),
+ DL(MF.getFunction().getParent()->getDataLayout()) {}
+
+Align GISelKnownBits::inferAlignmentForFrameIdx(int FrameIdx, int Offset,
+ const MachineFunction &MF) {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ return commonAlignment(Align(MFI.getObjectAlignment(FrameIdx)), Offset);
+ // TODO: How to handle cases with Base + Offset?
+}
+
+MaybeAlign GISelKnownBits::inferPtrAlignment(const MachineInstr &MI) {
+ if (MI.getOpcode() == TargetOpcode::G_FRAME_INDEX) {
+ int FrameIdx = MI.getOperand(1).getIndex();
+ return inferAlignmentForFrameIdx(FrameIdx, 0, *MI.getMF());
+ }
+ return None;
+}
+
+void GISelKnownBits::computeKnownBitsForFrameIndex(Register R, KnownBits &Known,
+ const APInt &DemandedElts,
+ unsigned Depth) {
+ const MachineInstr &MI = *MRI.getVRegDef(R);
+ computeKnownBitsForAlignment(Known, inferPtrAlignment(MI));
+}
+
+void GISelKnownBits::computeKnownBitsForAlignment(KnownBits &Known,
+ MaybeAlign Alignment) {
+ if (Alignment)
+ // The low bits are known zero if the pointer is aligned.
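+    // E.g. an alignment of 8 guarantees that the low 3 bits are zero.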
+ Known.Zero.setLowBits(Log2(Alignment));
+}
+
+KnownBits GISelKnownBits::getKnownBits(MachineInstr &MI) {
+ return getKnownBits(MI.getOperand(0).getReg());
+}
+
+KnownBits GISelKnownBits::getKnownBits(Register R) {
+ KnownBits Known;
+ LLT Ty = MRI.getType(R);
+ APInt DemandedElts =
+ Ty.isVector() ? APInt::getAllOnesValue(Ty.getNumElements()) : APInt(1, 1);
+ computeKnownBitsImpl(R, Known, DemandedElts);
+ return Known;
+}
+
+bool GISelKnownBits::signBitIsZero(Register R) {
+ LLT Ty = MRI.getType(R);
+ unsigned BitWidth = Ty.getScalarSizeInBits();
+ return maskedValueIsZero(R, APInt::getSignMask(BitWidth));
+}
+
+APInt GISelKnownBits::getKnownZeroes(Register R) {
+ return getKnownBits(R).Zero;
+}
+
+APInt GISelKnownBits::getKnownOnes(Register R) { return getKnownBits(R).One; }
+
+void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
+ const APInt &DemandedElts,
+ unsigned Depth) {
+ MachineInstr &MI = *MRI.getVRegDef(R);
+ unsigned Opcode = MI.getOpcode();
+ LLT DstTy = MRI.getType(R);
+
+ // Handle the case where this is called on a register that does not have a
+ // type constraint (i.e. it has a register class constraint instead). This is
+ // unlikely to occur except by looking through copies but it is possible for
+ // the initial register being queried to be in this state.
+ if (!DstTy.isValid()) {
+ Known = KnownBits();
+ return;
+ }
+
+ unsigned BitWidth = DstTy.getSizeInBits();
+ Known = KnownBits(BitWidth); // Don't know anything
+
+ if (DstTy.isVector())
+ return; // TODO: Handle vectors.
+
+ if (Depth == getMaxDepth())
+ return;
+
+ if (!DemandedElts)
+ return; // No demanded elts, better to assume we don't know anything.
+
+ KnownBits Known2;
+
+ switch (Opcode) {
+ default:
+ TL.computeKnownBitsForTargetInstr(*this, R, Known, DemandedElts, MRI,
+ Depth);
+ break;
+ case TargetOpcode::COPY: {
+ MachineOperand Dst = MI.getOperand(0);
+ MachineOperand Src = MI.getOperand(1);
+    // Look through trivial copies, but don't look through trivial copies of
+    // the form `%1:(s32) = OP %0:gpr32`, because known-bits analysis is
+    // currently unable to determine the bit width of a register class.
+ //
+ // We can't use NoSubRegister by name as it's defined by each target but
+ // it's always defined to be 0 by tablegen.
+ if (Dst.getSubReg() == 0 /*NoSubRegister*/ && Src.getReg().isVirtual() &&
+ Src.getSubReg() == 0 /*NoSubRegister*/ &&
+ MRI.getType(Src.getReg()).isValid()) {
+ // Don't increment Depth for this one since we didn't do any work.
+ computeKnownBitsImpl(Src.getReg(), Known, DemandedElts, Depth);
+ }
+ break;
+ }
+ case TargetOpcode::G_CONSTANT: {
+ auto CstVal = getConstantVRegVal(R, MRI);
+ if (!CstVal)
+ break;
+ Known.One = *CstVal;
+ Known.Zero = ~Known.One;
+ break;
+ }
+ case TargetOpcode::G_FRAME_INDEX: {
+ computeKnownBitsForFrameIndex(R, Known, DemandedElts);
+ break;
+ }
+ case TargetOpcode::G_SUB: {
+ // If low bits are known to be zero in both operands, then we know they are
+ // going to be 0 in the result. Both addition and complement operations
+ // preserve the low zero bits.
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts,
+ Depth + 1);
+ unsigned KnownZeroLow = Known2.countMinTrailingZeros();
+ if (KnownZeroLow == 0)
+ break;
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), Known2, DemandedElts,
+ Depth + 1);
+ KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros());
+ Known.Zero.setLowBits(KnownZeroLow);
+ break;
+ }
+ case TargetOpcode::G_XOR: {
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts,
+ Depth + 1);
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts,
+ Depth + 1);
+
+ // Output known-0 bits are known if clear or set in both the LHS & RHS.
+ APInt KnownZeroOut = (Known.Zero & Known2.Zero) | (Known.One & Known2.One);
+ // Output known-1 are known to be set if set in only one of the LHS, RHS.
+ Known.One = (Known.Zero & Known2.One) | (Known.One & Known2.Zero);
+ Known.Zero = KnownZeroOut;
+ break;
+ }
+ case TargetOpcode::G_GEP: {
+ // G_GEP is like G_ADD. FIXME: Is this true for all targets?
+ LLT Ty = MRI.getType(MI.getOperand(1).getReg());
+ if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
+ break;
+ LLVM_FALLTHROUGH;
+ }
+ case TargetOpcode::G_ADD: {
+ // Output known-0 bits are known if clear or set in both the low clear bits
+ // common to both LHS & RHS. For example, 8+(X<<3) is known to have the
+ // low 3 bits clear.
+ // Output known-0 bits are also known if the top bits of each input are
+ // known to be clear. For example, if one input has the top 10 bits clear
+ // and the other has the top 8 bits clear, we know the top 7 bits of the
+ // output must be clear.
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts,
+ Depth + 1);
+ unsigned KnownZeroHigh = Known2.countMinLeadingZeros();
+ unsigned KnownZeroLow = Known2.countMinTrailingZeros();
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), Known2, DemandedElts,
+ Depth + 1);
+ KnownZeroHigh = std::min(KnownZeroHigh, Known2.countMinLeadingZeros());
+ KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros());
+ Known.Zero.setLowBits(KnownZeroLow);
+ if (KnownZeroHigh > 1)
+ Known.Zero.setHighBits(KnownZeroHigh - 1);
+ break;
+ }
+ case TargetOpcode::G_AND: {
+ // If either the LHS or the RHS are Zero, the result is zero.
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts,
+ Depth + 1);
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts,
+ Depth + 1);
+
+ // Output known-1 bits are only known if set in both the LHS & RHS.
+ Known.One &= Known2.One;
+ // Output known-0 are known to be clear if zero in either the LHS | RHS.
+ Known.Zero |= Known2.Zero;
+ break;
+ }
+ case TargetOpcode::G_OR: {
+ // If either the LHS or the RHS are Zero, the result is zero.
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts,
+ Depth + 1);
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts,
+ Depth + 1);
+
+ // Output known-0 bits are only known if clear in both the LHS & RHS.
+ Known.Zero &= Known2.Zero;
+ // Output known-1 are known to be set if set in either the LHS | RHS.
+ Known.One |= Known2.One;
+ break;
+ }
+ case TargetOpcode::G_MUL: {
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), Known, DemandedElts,
+ Depth + 1);
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts,
+ Depth + 1);
+ // If low bits are zero in either operand, output low known-0 bits.
+ // Also compute a conservative estimate for high known-0 bits.
+ // More trickiness is possible, but this is sufficient for the
+ // interesting case of alignment computation.
+ unsigned TrailZ =
+ Known.countMinTrailingZeros() + Known2.countMinTrailingZeros();
+ unsigned LeadZ =
+ std::max(Known.countMinLeadingZeros() + Known2.countMinLeadingZeros(),
+ BitWidth) -
+ BitWidth;
+
+ Known.resetAll();
+ Known.Zero.setLowBits(std::min(TrailZ, BitWidth));
+ Known.Zero.setHighBits(std::min(LeadZ, BitWidth));
+ break;
+ }
+ case TargetOpcode::G_SELECT: {
+ computeKnownBitsImpl(MI.getOperand(3).getReg(), Known, DemandedElts,
+ Depth + 1);
+ // If we don't know any bits, early out.
+ if (Known.isUnknown())
+ break;
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), Known2, DemandedElts,
+ Depth + 1);
+ // Only known if known in both the LHS and RHS.
+ Known.One &= Known2.One;
+ Known.Zero &= Known2.Zero;
+ break;
+ }
+ case TargetOpcode::G_FCMP:
+ case TargetOpcode::G_ICMP: {
+ if (TL.getBooleanContents(DstTy.isVector(),
+ Opcode == TargetOpcode::G_FCMP) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ BitWidth > 1)
+ Known.Zero.setBitsFrom(1);
+ break;
+ }
+ case TargetOpcode::G_SEXT: {
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
+ Depth + 1);
+ // If the sign bit is known to be zero or one, then sext will extend
+ // it to the top bits, else it will just zext.
+ Known = Known.sext(BitWidth);
+ break;
+ }
+ case TargetOpcode::G_ANYEXT: {
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
+ Depth + 1);
+ Known = Known.zext(BitWidth, true /* ExtendedBitsAreKnownZero */);
+ break;
+ }
+ case TargetOpcode::G_LOAD: {
+ if (MI.hasOneMemOperand()) {
+ const MachineMemOperand *MMO = *MI.memoperands_begin();
+ if (const MDNode *Ranges = MMO->getRanges()) {
+ computeKnownBitsFromRangeMetadata(*Ranges, Known);
+ }
+ }
+ break;
+ }
+ case TargetOpcode::G_ZEXTLOAD: {
+ // Everything above the retrieved bits is zero
+ if (MI.hasOneMemOperand())
+ Known.Zero.setBitsFrom((*MI.memoperands_begin())->getSizeInBits());
+ break;
+ }
+ case TargetOpcode::G_ASHR:
+ case TargetOpcode::G_LSHR:
+ case TargetOpcode::G_SHL: {
+ KnownBits RHSKnown;
+ computeKnownBitsImpl(MI.getOperand(2).getReg(), RHSKnown, DemandedElts,
+ Depth + 1);
+ if (!RHSKnown.isConstant()) {
+ LLVM_DEBUG(
+ MachineInstr *RHSMI = MRI.getVRegDef(MI.getOperand(2).getReg());
+ dbgs() << '[' << Depth << "] Shift not known constant: " << *RHSMI);
+ break;
+ }
+ uint64_t Shift = RHSKnown.getConstant().getZExtValue();
+ LLVM_DEBUG(dbgs() << '[' << Depth << "] Shift is " << Shift << '\n');
+
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
+ Depth + 1);
+
+ switch (Opcode) {
+ case TargetOpcode::G_ASHR:
+ Known.Zero = Known.Zero.ashr(Shift);
+ Known.One = Known.One.ashr(Shift);
+ break;
+ case TargetOpcode::G_LSHR:
+ Known.Zero = Known.Zero.lshr(Shift);
+ Known.One = Known.One.lshr(Shift);
+ Known.Zero.setBitsFrom(Known.Zero.getBitWidth() - Shift);
+ break;
+ case TargetOpcode::G_SHL:
+ Known.Zero = Known.Zero.shl(Shift);
+ Known.One = Known.One.shl(Shift);
+ Known.Zero.setBits(0, Shift);
+ break;
+ }
+ break;
+ }
+ case TargetOpcode::G_INTTOPTR:
+ case TargetOpcode::G_PTRTOINT:
+ // Fall through and handle them the same as zext/trunc.
+ LLVM_FALLTHROUGH;
+ case TargetOpcode::G_ZEXT:
+ case TargetOpcode::G_TRUNC: {
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT SrcTy = MRI.getType(SrcReg);
+ unsigned SrcBitWidth = SrcTy.isPointer()
+ ? DL.getIndexSizeInBits(SrcTy.getAddressSpace())
+ : SrcTy.getSizeInBits();
+ assert(SrcBitWidth && "SrcBitWidth can't be zero");
+ Known = Known.zextOrTrunc(SrcBitWidth, true);
+ computeKnownBitsImpl(SrcReg, Known, DemandedElts, Depth + 1);
+ Known = Known.zextOrTrunc(BitWidth, true);
+ if (BitWidth > SrcBitWidth)
+ Known.Zero.setBitsFrom(SrcBitWidth);
+ break;
+ }
+ }
+
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ LLVM_DEBUG(dbgs() << "[" << Depth << "] Compute known bits: " << MI << "["
+ << Depth << "] Computed for: " << MI << "[" << Depth
+ << "] Known: 0x"
+ << (Known.Zero | Known.One).toString(16, false) << "\n"
+ << "[" << Depth << "] Zero: 0x"
+ << Known.Zero.toString(16, false) << "\n"
+ << "[" << Depth << "] One: 0x"
+ << Known.One.toString(16, false) << "\n");
+}
+
+void GISelKnownBitsAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool GISelKnownBitsAnalysis::runOnMachineFunction(MachineFunction &MF) {
+ return false;
+}
diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 6e99bdbd8264..45cef4aca888 100644
--- a/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -32,6 +32,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/StackProtector.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -334,7 +335,7 @@ bool IRTranslator::translateFNeg(const User &U, MachineIRBuilder &MIRBuilder) {
bool IRTranslator::translateCompare(const User &U,
MachineIRBuilder &MIRBuilder) {
- const CmpInst *CI = dyn_cast<CmpInst>(&U);
+ auto *CI = dyn_cast<CmpInst>(&U);
Register Op0 = getOrCreateVReg(*U.getOperand(0));
Register Op1 = getOrCreateVReg(*U.getOperand(1));
Register Res = getOrCreateVReg(U);
@@ -345,11 +346,12 @@ bool IRTranslator::translateCompare(const User &U,
MIRBuilder.buildICmp(Pred, Res, Op0, Op1);
else if (Pred == CmpInst::FCMP_FALSE)
MIRBuilder.buildCopy(
- Res, getOrCreateVReg(*Constant::getNullValue(CI->getType())));
+ Res, getOrCreateVReg(*Constant::getNullValue(U.getType())));
else if (Pred == CmpInst::FCMP_TRUE)
MIRBuilder.buildCopy(
- Res, getOrCreateVReg(*Constant::getAllOnesValue(CI->getType())));
+ Res, getOrCreateVReg(*Constant::getAllOnesValue(U.getType())));
else {
+ assert(CI && "Instruction should be CmpInst");
MIRBuilder.buildInstr(TargetOpcode::G_FCMP, {Res}, {Pred, Op0, Op1},
MachineInstr::copyFlagsFromInstruction(*CI));
}
@@ -588,8 +590,8 @@ void IRTranslator::emitSwitchCase(SwitchCG::CaseBlock &CB,
Register CondRHS = getOrCreateVReg(*CB.CmpRHS);
Cond = MIB.buildICmp(CB.PredInfo.Pred, i1Ty, CondLHS, CondRHS).getReg(0);
} else {
- assert(CB.PredInfo.Pred == CmpInst::ICMP_ULE &&
- "Can only handle ULE ranges");
+ assert(CB.PredInfo.Pred == CmpInst::ICMP_SLE &&
+ "Can only handle SLE ranges");
const APInt& Low = cast<ConstantInt>(CB.CmpLHS)->getValue();
const APInt& High = cast<ConstantInt>(CB.CmpRHS)->getValue();
@@ -598,7 +600,7 @@ void IRTranslator::emitSwitchCase(SwitchCG::CaseBlock &CB,
if (cast<ConstantInt>(CB.CmpLHS)->isMinValue(true)) {
Register CondRHS = getOrCreateVReg(*CB.CmpRHS);
Cond =
- MIB.buildICmp(CmpInst::ICMP_ULE, i1Ty, CmpOpReg, CondRHS).getReg(0);
+ MIB.buildICmp(CmpInst::ICMP_SLE, i1Ty, CmpOpReg, CondRHS).getReg(0);
} else {
const LLT &CmpTy = MRI->getType(CmpOpReg);
auto Sub = MIB.buildSub({CmpTy}, CmpOpReg, CondLHS);
@@ -728,7 +730,7 @@ bool IRTranslator::lowerSwitchRangeWorkItem(SwitchCG::CaseClusterIt I,
MHS = nullptr;
} else {
// Check I->Low <= Cond <= I->High.
- Pred = CmpInst::ICMP_ULE;
+ Pred = CmpInst::ICMP_SLE;
LHS = I->Low;
MHS = Cond;
RHS = I->High;
@@ -879,7 +881,8 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
return true;
}
-
+ const MDNode *Ranges =
+ Regs.size() == 1 ? LI.getMetadata(LLVMContext::MD_range) : nullptr;
for (unsigned i = 0; i < Regs.size(); ++i) {
Register Addr;
MIRBuilder.materializeGEP(Addr, Base, OffsetTy, Offsets[i] / 8);
@@ -888,7 +891,7 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
unsigned BaseAlign = getMemOpAlignment(LI);
auto MMO = MF->getMachineMemOperand(
Ptr, Flags, (MRI->getType(Regs[i]).getSizeInBits() + 7) / 8,
- MinAlign(BaseAlign, Offsets[i] / 8), AAMDNodes(), nullptr,
+ MinAlign(BaseAlign, Offsets[i] / 8), AAMDNodes(), Ranges,
LI.getSyncScopeID(), LI.getOrdering());
MIRBuilder.buildLoad(Regs[i], Addr, *MMO);
}
@@ -1075,36 +1078,29 @@ bool IRTranslator::translateGetElementPtr(const User &U,
}
if (Offset != 0) {
- Register NewBaseReg = MRI->createGenericVirtualRegister(PtrTy);
LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL);
auto OffsetMIB = MIRBuilder.buildConstant({OffsetTy}, Offset);
- MIRBuilder.buildGEP(NewBaseReg, BaseReg, OffsetMIB.getReg(0));
-
- BaseReg = NewBaseReg;
+ BaseReg =
+ MIRBuilder.buildGEP(PtrTy, BaseReg, OffsetMIB.getReg(0)).getReg(0);
Offset = 0;
}
Register IdxReg = getOrCreateVReg(*Idx);
- if (MRI->getType(IdxReg) != OffsetTy) {
- Register NewIdxReg = MRI->createGenericVirtualRegister(OffsetTy);
- MIRBuilder.buildSExtOrTrunc(NewIdxReg, IdxReg);
- IdxReg = NewIdxReg;
- }
+ if (MRI->getType(IdxReg) != OffsetTy)
+ IdxReg = MIRBuilder.buildSExtOrTrunc(OffsetTy, IdxReg).getReg(0);
// N = N + Idx * ElementSize;
// Avoid doing it for ElementSize of 1.
Register GepOffsetReg;
if (ElementSize != 1) {
- GepOffsetReg = MRI->createGenericVirtualRegister(OffsetTy);
auto ElementSizeMIB = MIRBuilder.buildConstant(
getLLTForType(*OffsetIRTy, *DL), ElementSize);
- MIRBuilder.buildMul(GepOffsetReg, ElementSizeMIB.getReg(0), IdxReg);
+ GepOffsetReg =
+ MIRBuilder.buildMul(OffsetTy, ElementSizeMIB, IdxReg).getReg(0);
} else
GepOffsetReg = IdxReg;
- Register NewBaseReg = MRI->createGenericVirtualRegister(PtrTy);
- MIRBuilder.buildGEP(NewBaseReg, BaseReg, GepOffsetReg);
- BaseReg = NewBaseReg;
+ BaseReg = MIRBuilder.buildGEP(PtrTy, BaseReg, GepOffsetReg).getReg(0);
}
}
@@ -1119,54 +1115,51 @@ bool IRTranslator::translateGetElementPtr(const User &U,
return true;
}
-bool IRTranslator::translateMemfunc(const CallInst &CI,
+bool IRTranslator::translateMemFunc(const CallInst &CI,
MachineIRBuilder &MIRBuilder,
- unsigned ID) {
+ Intrinsic::ID ID) {
// If the source is undef, then just emit a nop.
- if (isa<UndefValue>(CI.getArgOperand(1))) {
- switch (ID) {
- case Intrinsic::memmove:
- case Intrinsic::memcpy:
- case Intrinsic::memset:
- return true;
- default:
- break;
- }
- }
-
- LLT SizeTy = getLLTForType(*CI.getArgOperand(2)->getType(), *DL);
- Type *DstTy = CI.getArgOperand(0)->getType();
- if (cast<PointerType>(DstTy)->getAddressSpace() != 0 ||
- SizeTy.getSizeInBits() != DL->getPointerSizeInBits(0))
- return false;
+ if (isa<UndefValue>(CI.getArgOperand(1)))
+ return true;
- SmallVector<CallLowering::ArgInfo, 8> Args;
- for (int i = 0; i < 3; ++i) {
- const auto &Arg = CI.getArgOperand(i);
- Args.emplace_back(getOrCreateVReg(*Arg), Arg->getType());
+ ArrayRef<Register> Res;
+ auto ICall = MIRBuilder.buildIntrinsic(ID, Res, true);
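+  // Add every IR argument except the trailing 'isvolatile' flag; volatility
+  // is conveyed via the memory operands attached below instead.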
+ for (auto AI = CI.arg_begin(), AE = CI.arg_end(); std::next(AI) != AE; ++AI)
+ ICall.addUse(getOrCreateVReg(**AI));
+
+ unsigned DstAlign = 0, SrcAlign = 0;
+ unsigned IsVol =
+ cast<ConstantInt>(CI.getArgOperand(CI.getNumArgOperands() - 1))
+ ->getZExtValue();
+
+ if (auto *MCI = dyn_cast<MemCpyInst>(&CI)) {
+ DstAlign = std::max<unsigned>(MCI->getDestAlignment(), 1);
+ SrcAlign = std::max<unsigned>(MCI->getSourceAlignment(), 1);
+ } else if (auto *MMI = dyn_cast<MemMoveInst>(&CI)) {
+ DstAlign = std::max<unsigned>(MMI->getDestAlignment(), 1);
+ SrcAlign = std::max<unsigned>(MMI->getSourceAlignment(), 1);
+ } else {
+ auto *MSI = cast<MemSetInst>(&CI);
+ DstAlign = std::max<unsigned>(MSI->getDestAlignment(), 1);
}
- const char *Callee;
- switch (ID) {
- case Intrinsic::memmove:
- case Intrinsic::memcpy: {
- Type *SrcTy = CI.getArgOperand(1)->getType();
- if(cast<PointerType>(SrcTy)->getAddressSpace() != 0)
- return false;
- Callee = ID == Intrinsic::memcpy ? "memcpy" : "memmove";
- break;
- }
- case Intrinsic::memset:
- Callee = "memset";
- break;
- default:
- return false;
- }
+ // We need to propagate the tail call flag from the IR inst as an argument.
+ // Otherwise, we have to pessimize and assume later that we cannot tail call
+ // any memory intrinsics.
+ ICall.addImm(CI.isTailCall() ? 1 : 0);
- return CLI->lowerCall(MIRBuilder, CI.getCallingConv(),
- MachineOperand::CreateES(Callee),
- CallLowering::ArgInfo({0}, CI.getType()), Args);
+ // Create mem operands to store the alignment and volatile info.
+  auto VolFlag = IsVol ? MachineMemOperand::MOVolatile
+                       : MachineMemOperand::MONone;
+ ICall.addMemOperand(MF->getMachineMemOperand(
+ MachinePointerInfo(CI.getArgOperand(0)),
+ MachineMemOperand::MOStore | VolFlag, 1, DstAlign));
+ if (ID != Intrinsic::memset)
+ ICall.addMemOperand(MF->getMachineMemOperand(
+ MachinePointerInfo(CI.getArgOperand(1)),
+ MachineMemOperand::MOLoad | VolFlag, 1, SrcAlign));
+
+ return true;
}
void IRTranslator::getStackGuard(Register DstReg,
@@ -1186,7 +1179,7 @@ void IRTranslator::getStackGuard(Register DstReg,
MachineMemOperand::MODereferenceable;
MachineMemOperand *MemRef =
MF->getMachineMemOperand(MPInfo, Flags, DL->getPointerSizeInBits() / 8,
- DL->getPointerABIAlignment(0));
+ DL->getPointerABIAlignment(0).value());
MIB.setMemRefs({MemRef});
}
@@ -1208,6 +1201,8 @@ unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) {
break;
case Intrinsic::bswap:
return TargetOpcode::G_BSWAP;
+ case Intrinsic::bitreverse:
+ return TargetOpcode::G_BITREVERSE;
case Intrinsic::ceil:
return TargetOpcode::G_FCEIL;
case Intrinsic::cos:
@@ -1383,16 +1378,17 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
if (!V) {
// Currently the optimizer can produce this; insert an undef to
// help debugging. Probably the optimizer should not do this.
- MIRBuilder.buildIndirectDbgValue(0, DI.getVariable(), DI.getExpression());
+ MIRBuilder.buildDirectDbgValue(0, DI.getVariable(), DI.getExpression());
} else if (const auto *CI = dyn_cast<Constant>(V)) {
MIRBuilder.buildConstDbgValue(*CI, DI.getVariable(), DI.getExpression());
} else {
- Register Reg = getOrCreateVReg(*V);
- // FIXME: This does not handle register-indirect values at offset 0. The
- // direct/indirect thing shouldn't really be handled by something as
- // implicit as reg+noreg vs reg+imm in the first palce, but it seems
- // pretty baked in right now.
- MIRBuilder.buildDirectDbgValue(Reg, DI.getVariable(), DI.getExpression());
+ for (Register Reg : getOrCreateVRegs(*V)) {
+ // FIXME: This does not handle register-indirect values at offset 0. The
+ // direct/indirect thing shouldn't really be handled by something as
+ // implicit as reg+noreg vs reg+imm in the first place, but it seems
+ // pretty baked in right now.
+ MIRBuilder.buildDirectDbgValue(Reg, DI.getVariable(), DI.getExpression());
+ }
}
return true;
}
@@ -1433,7 +1429,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
case Intrinsic::memcpy:
case Intrinsic::memmove:
case Intrinsic::memset:
- return translateMemfunc(CI, MIRBuilder, ID);
+ return translateMemFunc(CI, MIRBuilder, ID);
case Intrinsic::eh_typeid_for: {
GlobalValue *GV = ExtractTypeInfo(CI.getArgOperand(0));
Register Reg = getOrCreateVReg(CI);
@@ -1441,18 +1437,12 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
MIRBuilder.buildConstant(Reg, TypeID);
return true;
}
- case Intrinsic::objectsize: {
- // If we don't know by now, we're never going to know.
- const ConstantInt *Min = cast<ConstantInt>(CI.getArgOperand(1));
+ case Intrinsic::objectsize:
+ llvm_unreachable("llvm.objectsize.* should have been lowered already");
- MIRBuilder.buildConstant(getOrCreateVReg(CI), Min->isZero() ? -1ULL : 0);
- return true;
- }
case Intrinsic::is_constant:
- // If this wasn't constant-folded away by now, then it's not a
- // constant.
- MIRBuilder.buildConstant(getOrCreateVReg(CI), 0);
- return true;
+ llvm_unreachable("llvm.is.constant.* should have been lowered already");
+
case Intrinsic::stackguard:
getStackGuard(getOrCreateVReg(CI), MIRBuilder);
return true;
@@ -1551,6 +1541,46 @@ bool IRTranslator::translateInlineAsm(const CallInst &CI,
return true;
}
+bool IRTranslator::translateCallSite(const ImmutableCallSite &CS,
+ MachineIRBuilder &MIRBuilder) {
+ const Instruction &I = *CS.getInstruction();
+ ArrayRef<Register> Res = getOrCreateVRegs(I);
+
+ SmallVector<ArrayRef<Register>, 8> Args;
+ Register SwiftInVReg = 0;
+ Register SwiftErrorVReg = 0;
+ for (auto &Arg : CS.args()) {
+ if (CLI->supportSwiftError() && isSwiftError(Arg)) {
+ assert(SwiftInVReg == 0 && "Expected only one swift error argument");
+ LLT Ty = getLLTForType(*Arg->getType(), *DL);
+ SwiftInVReg = MRI->createGenericVirtualRegister(Ty);
+ MIRBuilder.buildCopy(SwiftInVReg, SwiftError.getOrCreateVRegUseAt(
+ &I, &MIRBuilder.getMBB(), Arg));
+ Args.emplace_back(makeArrayRef(SwiftInVReg));
+ SwiftErrorVReg =
+ SwiftError.getOrCreateVRegDefAt(&I, &MIRBuilder.getMBB(), Arg);
+ continue;
+ }
+ Args.push_back(getOrCreateVRegs(*Arg));
+ }
+
+ // We don't set HasCalls on MFI here yet because call lowering may decide to
+ // optimize into tail calls. Instead, we defer that to selection where a final
+ // scan is done to check if any instructions are calls.
+ bool Success =
+ CLI->lowerCall(MIRBuilder, CS, Res, Args, SwiftErrorVReg,
+ [&]() { return getOrCreateVReg(*CS.getCalledValue()); });
+
+ // Check if we just inserted a tail call.
+ if (Success) {
+ assert(!HasTailCall && "Can't tail call return twice from block?");
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ HasTailCall = TII->isTailCall(*std::prev(MIRBuilder.getInsertPt()));
+ }
+
+ return Success;
+}
+
bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
const CallInst &CI = cast<CallInst>(U);
auto TII = MF->getTarget().getIntrinsicInfo();
@@ -1570,34 +1600,8 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
ID = static_cast<Intrinsic::ID>(TII->getIntrinsicID(F));
}
- if (!F || !F->isIntrinsic() || ID == Intrinsic::not_intrinsic) {
- ArrayRef<Register> Res = getOrCreateVRegs(CI);
-
- SmallVector<ArrayRef<Register>, 8> Args;
- Register SwiftInVReg = 0;
- Register SwiftErrorVReg = 0;
- for (auto &Arg: CI.arg_operands()) {
- if (CLI->supportSwiftError() && isSwiftError(Arg)) {
- assert(SwiftInVReg == 0 && "Expected only one swift error argument");
- LLT Ty = getLLTForType(*Arg->getType(), *DL);
- SwiftInVReg = MRI->createGenericVirtualRegister(Ty);
- MIRBuilder.buildCopy(SwiftInVReg, SwiftError.getOrCreateVRegUseAt(
- &CI, &MIRBuilder.getMBB(), Arg));
- Args.emplace_back(makeArrayRef(SwiftInVReg));
- SwiftErrorVReg =
- SwiftError.getOrCreateVRegDefAt(&CI, &MIRBuilder.getMBB(), Arg);
- continue;
- }
- Args.push_back(getOrCreateVRegs(*Arg));
- }
-
- MF->getFrameInfo().setHasCalls(true);
- bool Success =
- CLI->lowerCall(MIRBuilder, &CI, Res, Args, SwiftErrorVReg,
- [&]() { return getOrCreateVReg(*CI.getCalledValue()); });
-
- return Success;
- }
+ if (!F || !F->isIntrinsic() || ID == Intrinsic::not_intrinsic)
+ return translateCallSite(&CI, MIRBuilder);
assert(ID != Intrinsic::not_intrinsic && "unknown intrinsic");
@@ -1615,14 +1619,29 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
if (isa<FPMathOperator>(CI))
MIB->copyIRFlags(CI);
- for (auto &Arg : CI.arg_operands()) {
+ for (auto &Arg : enumerate(CI.arg_operands())) {
// Some intrinsics take metadata parameters. Reject them.
- if (isa<MetadataAsValue>(Arg))
- return false;
- ArrayRef<Register> VRegs = getOrCreateVRegs(*Arg);
- if (VRegs.size() > 1)
+ if (isa<MetadataAsValue>(Arg.value()))
return false;
- MIB.addUse(VRegs[0]);
+
+ // If this is required to be an immediate, don't materialize it in a
+ // register.
+ if (CI.paramHasAttr(Arg.index(), Attribute::ImmArg)) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Arg.value())) {
+ // imm arguments are more convenient than cimm (and realistically
+ // probably sufficient), so use them.
+ assert(CI->getBitWidth() <= 64 &&
+ "large intrinsic immediates not handled");
+ MIB.addImm(CI->getSExtValue());
+ } else {
+ MIB.addFPImm(cast<ConstantFP>(Arg.value()));
+ }
+ } else {
+ ArrayRef<Register> VRegs = getOrCreateVRegs(*Arg.value());
+ if (VRegs.size() > 1)
+ return false;
+ MIB.addUse(VRegs[0]);
+ }
}
// Add a MachineMemOperand if it is a target mem intrinsic.
@@ -1630,13 +1649,14 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
TargetLowering::IntrinsicInfo Info;
// TODO: Add a GlobalISel version of getTgtMemIntrinsic.
if (TLI.getTgtMemIntrinsic(Info, CI, *MF, ID)) {
- unsigned Align = Info.align;
- if (Align == 0)
- Align = DL->getABITypeAlignment(Info.memVT.getTypeForEVT(F->getContext()));
+ MaybeAlign Align = Info.align;
+ if (!Align)
+ Align = MaybeAlign(
+ DL->getABITypeAlignment(Info.memVT.getTypeForEVT(F->getContext())));
uint64_t Size = Info.memVT.getStoreSize();
- MIB.addMemOperand(MF->getMachineMemOperand(MachinePointerInfo(Info.ptrVal),
- Info.flags, Size, Align));
+ MIB.addMemOperand(MF->getMachineMemOperand(
+ MachinePointerInfo(Info.ptrVal), Info.flags, Size, Align->value()));
}
return true;
@@ -1672,30 +1692,7 @@ bool IRTranslator::translateInvoke(const User &U,
MCSymbol *BeginSymbol = Context.createTempSymbol();
MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(BeginSymbol);
- ArrayRef<Register> Res;
- if (!I.getType()->isVoidTy())
- Res = getOrCreateVRegs(I);
- SmallVector<ArrayRef<Register>, 8> Args;
- Register SwiftErrorVReg = 0;
- Register SwiftInVReg = 0;
- for (auto &Arg : I.arg_operands()) {
- if (CLI->supportSwiftError() && isSwiftError(Arg)) {
- assert(SwiftInVReg == 0 && "Expected only one swift error argument");
- LLT Ty = getLLTForType(*Arg->getType(), *DL);
- SwiftInVReg = MRI->createGenericVirtualRegister(Ty);
- MIRBuilder.buildCopy(SwiftInVReg, SwiftError.getOrCreateVRegUseAt(
- &I, &MIRBuilder.getMBB(), Arg));
- Args.push_back(makeArrayRef(SwiftInVReg));
- SwiftErrorVReg =
- SwiftError.getOrCreateVRegDefAt(&I, &MIRBuilder.getMBB(), Arg);
- continue;
- }
-
- Args.push_back(getOrCreateVRegs(*Arg));
- }
-
- if (!CLI->lowerCall(MIRBuilder, &I, Res, Args, SwiftErrorVReg,
- [&]() { return getOrCreateVReg(*I.getCalledValue()); }))
+ if (!translateCallSite(&I, MIRBuilder))
return false;
MCSymbol *EndSymbol = Context.createTempSymbol();
@@ -1811,36 +1808,25 @@ bool IRTranslator::translateAlloca(const User &U,
Register AllocSize = MRI->createGenericVirtualRegister(IntPtrTy);
Register TySize =
- getOrCreateVReg(*ConstantInt::get(IntPtrIRTy, -DL->getTypeAllocSize(Ty)));
+ getOrCreateVReg(*ConstantInt::get(IntPtrIRTy, DL->getTypeAllocSize(Ty)));
MIRBuilder.buildMul(AllocSize, NumElts, TySize);
- LLT PtrTy = getLLTForType(*AI.getType(), *DL);
- auto &TLI = *MF->getSubtarget().getTargetLowering();
- Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
-
- Register SPTmp = MRI->createGenericVirtualRegister(PtrTy);
- MIRBuilder.buildCopy(SPTmp, SPReg);
-
- Register AllocTmp = MRI->createGenericVirtualRegister(PtrTy);
- MIRBuilder.buildGEP(AllocTmp, SPTmp, AllocSize);
-
- // Handle alignment. We have to realign if the allocation granule was smaller
- // than stack alignment, or the specific alloca requires more than stack
- // alignment.
unsigned StackAlign =
MF->getSubtarget().getFrameLowering()->getStackAlignment();
- Align = std::max(Align, StackAlign);
- if (Align > StackAlign || DL->getTypeAllocSize(Ty) % StackAlign != 0) {
- // Round the size of the allocation up to the stack alignment size
- // by add SA-1 to the size. This doesn't overflow because we're computing
- // an address inside an alloca.
- Register AlignedAlloc = MRI->createGenericVirtualRegister(PtrTy);
- MIRBuilder.buildPtrMask(AlignedAlloc, AllocTmp, Log2_32(Align));
- AllocTmp = AlignedAlloc;
- }
+ if (Align <= StackAlign)
+ Align = 0;
+
+ // Round the size of the allocation up to the stack alignment size
+  // by adding SA-1 to the size. This doesn't overflow because we're computing
+ // an address inside an alloca.
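+  // I.e. AlignedAlloc = (AllocSize + StackAlign - 1) & ~(StackAlign - 1).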
+ auto SAMinusOne = MIRBuilder.buildConstant(IntPtrTy, StackAlign - 1);
+ auto AllocAdd = MIRBuilder.buildAdd(IntPtrTy, AllocSize, SAMinusOne,
+ MachineInstr::NoUWrap);
+ auto AlignCst =
+ MIRBuilder.buildConstant(IntPtrTy, ~(uint64_t)(StackAlign - 1));
+ auto AlignedAlloc = MIRBuilder.buildAnd(IntPtrTy, AllocAdd, AlignCst);
- MIRBuilder.buildCopy(SPReg, AllocTmp);
- MIRBuilder.buildCopy(getOrCreateVReg(AI), AllocTmp);
+ MIRBuilder.buildDynStackAlloc(getOrCreateVReg(AI), AlignedAlloc, Align);
MF->getFrameInfo().CreateVariableSizedObject(Align ? Align : 1, &AI);
assert(MF->getFrameInfo().hasVarSizedObjects());
@@ -1926,7 +1912,7 @@ bool IRTranslator::translateShuffleVector(const User &U,
.addDef(getOrCreateVReg(U))
.addUse(getOrCreateVReg(*U.getOperand(0)))
.addUse(getOrCreateVReg(*U.getOperand(1)))
- .addUse(getOrCreateVReg(*U.getOperand(2)));
+ .addShuffleMask(cast<Constant>(U.getOperand(2)));
return true;
}
@@ -1991,7 +1977,6 @@ bool IRTranslator::translateAtomicRMW(const User &U,
unsigned Opcode = 0;
switch (I.getOperation()) {
default:
- llvm_unreachable("Unknown atomicrmw op");
return false;
case AtomicRMWInst::Xchg:
Opcode = TargetOpcode::G_ATOMICRMW_XCHG;
@@ -2026,6 +2011,12 @@ bool IRTranslator::translateAtomicRMW(const User &U,
case AtomicRMWInst::UMin:
Opcode = TargetOpcode::G_ATOMICRMW_UMIN;
break;
+ case AtomicRMWInst::FAdd:
+ Opcode = TargetOpcode::G_ATOMICRMW_FADD;
+ break;
+ case AtomicRMWInst::FSub:
+ Opcode = TargetOpcode::G_ATOMICRMW_FSUB;
+ break;
}
MIRBuilder.buildAtomicRMW(
@@ -2197,6 +2188,20 @@ void IRTranslator::finalizeFunction() {
FuncInfo.clear();
}
+/// Returns true if a BasicBlock \p BB within a variadic function contains a
+/// variadic musttail call.
+static bool checkForMustTailInVarArgFn(bool IsVarArg, const BasicBlock &BB) {
+ if (!IsVarArg)
+ return false;
+
+ // Walk the block backwards, because tail calls usually only appear at the end
+ // of a block.
+ return std::any_of(BB.rbegin(), BB.rend(), [](const Instruction &I) {
+ const auto *CI = dyn_cast<CallInst>(&I);
+ return CI && CI->isMustTailCall();
+ });
+}
+
bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
MF = &CurMF;
const Function &F = MF->getFunction();
@@ -2212,26 +2217,26 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
: TPC->isGISelCSEEnabled();
if (EnableCSE) {
- EntryBuilder = make_unique<CSEMIRBuilder>(CurMF);
+ EntryBuilder = std::make_unique<CSEMIRBuilder>(CurMF);
CSEInfo = &Wrapper.get(TPC->getCSEConfig());
EntryBuilder->setCSEInfo(CSEInfo);
- CurBuilder = make_unique<CSEMIRBuilder>(CurMF);
+ CurBuilder = std::make_unique<CSEMIRBuilder>(CurMF);
CurBuilder->setCSEInfo(CSEInfo);
} else {
- EntryBuilder = make_unique<MachineIRBuilder>();
- CurBuilder = make_unique<MachineIRBuilder>();
+ EntryBuilder = std::make_unique<MachineIRBuilder>();
+ CurBuilder = std::make_unique<MachineIRBuilder>();
}
CLI = MF->getSubtarget().getCallLowering();
CurBuilder->setMF(*MF);
EntryBuilder->setMF(*MF);
MRI = &MF->getRegInfo();
DL = &F.getParent()->getDataLayout();
- ORE = llvm::make_unique<OptimizationRemarkEmitter>(&F);
+ ORE = std::make_unique<OptimizationRemarkEmitter>(&F);
FuncInfo.MF = MF;
FuncInfo.BPI = nullptr;
const auto &TLI = *MF->getSubtarget().getTargetLowering();
const TargetMachine &TM = MF->getTarget();
- SL = make_unique<GISelSwitchLowering>(this, FuncInfo);
+ SL = std::make_unique<GISelSwitchLowering>(this, FuncInfo);
SL->init(TLI, TM, *DL);
EnableOpts = TM.getOptLevel() != CodeGenOpt::None && !skipFunction(F);
@@ -2258,6 +2263,9 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
SwiftError.setFunction(CurMF);
SwiftError.createEntriesInEntryBlock(DbgLoc);
+ bool IsVarArg = F.isVarArg();
+ bool HasMustTailInVarArgFn = false;
+
// Create all blocks, in IR order, to preserve the layout.
for (const BasicBlock &BB: F) {
auto *&MBB = BBToMBB[&BB];
@@ -2267,8 +2275,13 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
if (BB.hasAddressTaken())
MBB->setHasAddressTaken();
+
+ if (!HasMustTailInVarArgFn)
+ HasMustTailInVarArgFn = checkForMustTailInVarArgFn(IsVarArg, BB);
}
+ MF->getFrameInfo().setHasMustTailInVarArgFunc(HasMustTailInVarArgFn);
+
// Make our arguments/constants entry block fallthrough to the IR entry block.
EntryBB->addSuccessor(&getMBB(F.front()));
@@ -2286,18 +2299,6 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
}
}
- // We don't currently support translating swifterror or swiftself functions.
- for (auto &Arg : F.args()) {
- if (Arg.hasSwiftSelfAttr()) {
- OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
- F.getSubprogram(), &F.getEntryBlock());
- R << "unable to lower arguments due to swiftself: "
- << ore::NV("Prototype", F.getType());
- reportTranslationError(*MF, *TPC, *ORE, R);
- return false;
- }
- }
-
if (!CLI->lowerFormalArguments(*EntryBuilder.get(), F, VRegArgs)) {
OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
F.getSubprogram(), &F.getEntryBlock());
@@ -2322,8 +2323,15 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
// Set the insertion point of all the following translations to
// the end of this basic block.
CurBuilder->setMBB(MBB);
-
+ HasTailCall = false;
for (const Instruction &Inst : *BB) {
+ // If we translated a tail call in the last step, then we know
+ // everything after the call is either a return, or something that is
+ // handled by the call itself. (E.g. a lifetime marker or assume
+ // intrinsic.) In this case, we should stop translating the block and
+ // move on.
+ if (HasTailCall)
+ break;
#ifndef NDEBUG
Verifier.setCurrentInst(&Inst);
#endif // ifndef NDEBUG
diff --git a/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/lib/CodeGen/GlobalISel/InstructionSelect.cpp
index 70694fe6b6c8..7c4fd2d140d3 100644
--- a/lib/CodeGen/GlobalISel/InstructionSelect.cpp
+++ b/lib/CodeGen/GlobalISel/InstructionSelect.cpp
@@ -12,11 +12,14 @@
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -45,6 +48,7 @@ INITIALIZE_PASS_BEGIN(InstructionSelect, DEBUG_TYPE,
"Select target instructions out of generic instructions",
false, false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(InstructionSelect, DEBUG_TYPE,
"Select target instructions out of generic instructions",
false, false)
@@ -53,6 +57,8 @@ InstructionSelect::InstructionSelect() : MachineFunctionPass(ID) { }
void InstructionSelect::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetPassConfig>();
+ AU.addRequired<GISelKnownBitsAnalysis>();
+ AU.addPreserved<GISelKnownBitsAnalysis>();
getSelectionDAGFallbackAnalysisUsage(AU);
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -64,11 +70,13 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
return false;
LLVM_DEBUG(dbgs() << "Selecting function: " << MF.getName() << '\n');
+ GISelKnownBits &KB = getAnalysis<GISelKnownBitsAnalysis>().get(MF);
const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
- const InstructionSelector *ISel = MF.getSubtarget().getInstructionSelector();
+ InstructionSelector *ISel = MF.getSubtarget().getInstructionSelector();
CodeGenCoverage CoverageInfo;
assert(ISel && "Cannot work without InstructionSelector");
+ ISel->setupMF(MF, KB, CoverageInfo);
// An optimization remark emitter. Used to report failures.
MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr);
@@ -124,7 +132,7 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
continue;
}
- if (!ISel->select(MI, CoverageInfo)) {
+ if (!ISel->select(MI)) {
// FIXME: It would be nice to dump all inserted instructions. It's
// not obvious how, esp. considering select() can insert after MI.
reportGISelFailure(MF, TPC, MORE, "gisel-select", "cannot select", MI);
@@ -159,10 +167,10 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
--MII;
if (MI.getOpcode() != TargetOpcode::COPY)
continue;
- unsigned SrcReg = MI.getOperand(1).getReg();
- unsigned DstReg = MI.getOperand(0).getReg();
- if (TargetRegisterInfo::isVirtualRegister(SrcReg) &&
- TargetRegisterInfo::isVirtualRegister(DstReg)) {
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
+ if (Register::isVirtualRegister(SrcReg) &&
+ Register::isVirtualRegister(DstReg)) {
auto SrcRC = MRI.getRegClass(SrcReg);
auto DstRC = MRI.getRegClass(DstReg);
if (SrcRC == DstRC) {
@@ -179,7 +187,7 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
// that the size of the now-constrained vreg is unchanged and that it has a
// register class.
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
- unsigned VReg = TargetRegisterInfo::index2VirtReg(I);
+ unsigned VReg = Register::index2VirtReg(I);
MachineInstr *MI = nullptr;
if (!MRI.def_empty(VReg))
@@ -217,6 +225,22 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
auto &TLI = *MF.getSubtarget().getTargetLowering();
TLI.finalizeLowering(MF);
+ // Determine if there are any calls in this machine function. Ported from
+ // SelectionDAG.
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ for (const auto &MBB : MF) {
+ if (MFI.hasCalls() && MF.hasInlineAsm())
+ break;
+
+ for (const auto &MI : MBB) {
+ if ((MI.isCall() && !MI.isReturn()) || MI.isStackAligningInlineAsm())
+ MFI.setHasCalls(true);
+ if (MI.isInlineAsm())
+ MF.setHasInlineAsm(true);
+ }
+ }
+
LLVM_DEBUG({
dbgs() << "Rules covered by selecting function: " << MF.getName() << ":";
for (auto RuleID : CoverageInfo.covered())
diff --git a/lib/CodeGen/GlobalISel/InstructionSelector.cpp b/lib/CodeGen/GlobalISel/InstructionSelector.cpp
index 2ad35b3a72c9..28143b30d4e8 100644
--- a/lib/CodeGen/GlobalISel/InstructionSelector.cpp
+++ b/lib/CodeGen/GlobalISel/InstructionSelector.cpp
@@ -79,5 +79,5 @@ bool InstructionSelector::isObviouslySafeToFold(MachineInstr &MI,
return true;
return !MI.mayLoadOrStore() && !MI.mayRaiseFPException() &&
- !MI.hasUnmodeledSideEffects() && empty(MI.implicit_operands());
+ !MI.hasUnmodeledSideEffects() && MI.implicit_operands().empty();
}
diff --git a/lib/CodeGen/GlobalISel/Legalizer.cpp b/lib/CodeGen/GlobalISel/Legalizer.cpp
index b5b26bff34bb..1593e21fe07e 100644
--- a/lib/CodeGen/GlobalISel/Legalizer.cpp
+++ b/lib/CodeGen/GlobalISel/Legalizer.cpp
@@ -184,11 +184,11 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
: TPC.isGISelCSEEnabled();
if (EnableCSE) {
- MIRBuilder = make_unique<CSEMIRBuilder>();
+ MIRBuilder = std::make_unique<CSEMIRBuilder>();
CSEInfo = &Wrapper.get(TPC.getCSEConfig());
MIRBuilder->setCSEInfo(CSEInfo);
} else
- MIRBuilder = make_unique<MachineIRBuilder>();
+ MIRBuilder = std::make_unique<MachineIRBuilder>();
// This observer keeps the worklist updated.
LegalizerWorkListManager WorkListObserver(InstList, ArtifactList);
// We want both WorkListObserver as well as CSEInfo to observe all changes.
@@ -206,8 +206,16 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
auto RemoveDeadInstFromLists = [&WrapperObserver](MachineInstr *DeadMI) {
WrapperObserver.erasingInstr(*DeadMI);
};
+ auto stopLegalizing = [&](MachineInstr &MI) {
+ Helper.MIRBuilder.stopObservingChanges();
+ reportGISelFailure(MF, TPC, MORE, "gisel-legalize",
+ "unable to legalize instruction", MI);
+ };
bool Changed = false;
+ SmallVector<MachineInstr *, 128> RetryList;
do {
+ assert(RetryList.empty() && "Expected no instructions in RetryList");
+ unsigned NumArtifacts = ArtifactList.size();
while (!InstList.empty()) {
MachineInstr &MI = *InstList.pop_back_val();
assert(isPreISelGenericOpcode(MI.getOpcode()) && "Expecting generic opcode");
@@ -222,14 +230,31 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
// Error out if we couldn't legalize this instruction. We may want to
// fall back to DAG ISel instead in the future.
if (Res == LegalizerHelper::UnableToLegalize) {
- Helper.MIRBuilder.stopObservingChanges();
- reportGISelFailure(MF, TPC, MORE, "gisel-legalize",
- "unable to legalize instruction", MI);
+ // Move illegal artifacts to RetryList instead of aborting because
+ // legalizing InstList may generate artifacts that allow
+          // ArtifactCombiner to combine them away.
+ if (isArtifact(MI)) {
+ RetryList.push_back(&MI);
+ continue;
+ }
+ stopLegalizing(MI);
return false;
}
WorkListObserver.printNewInstrs();
Changed |= Res == LegalizerHelper::Legalized;
}
+ // Try to combine the instructions in RetryList again if there
+ // are new artifacts. If not, stop legalizing.
+ if (!RetryList.empty()) {
+ if (ArtifactList.size() > NumArtifacts) {
+ while (!RetryList.empty())
+ ArtifactList.insert(RetryList.pop_back_val());
+ } else {
+ MachineInstr *MI = *RetryList.begin();
+ stopLegalizing(*MI);
+ return false;
+ }
+ }
while (!ArtifactList.empty()) {
MachineInstr &MI = *ArtifactList.pop_back_val();
assert(isPreISelGenericOpcode(MI.getOpcode()) && "Expecting generic opcode");
diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index f5cf7fc9bd9b..21512e543878 100644
--- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -17,6 +17,7 @@
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -171,6 +172,26 @@ bool LegalizerHelper::extractParts(Register Reg, LLT RegTy,
return true;
}
+static LLT getGCDType(LLT OrigTy, LLT TargetTy) {
+ if (OrigTy.isVector() && TargetTy.isVector()) {
+ assert(OrigTy.getElementType() == TargetTy.getElementType());
+ int GCD = greatestCommonDivisor(OrigTy.getNumElements(),
+ TargetTy.getNumElements());
+ return LLT::scalarOrVector(GCD, OrigTy.getElementType());
+ }
+
+ if (OrigTy.isVector() && !TargetTy.isVector()) {
+ assert(OrigTy.getElementType() == TargetTy);
+ return TargetTy;
+ }
+
+ assert(!OrigTy.isVector() && !TargetTy.isVector());
+
+ int GCD = greatestCommonDivisor(OrigTy.getSizeInBits(),
+ TargetTy.getSizeInBits());
+ return LLT::scalar(GCD);
+}
+
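
To make the getGCDType rule above concrete, here is the same arithmetic on plain element and bit counts, using std::gcd in place of the LLVM helper (a sketch, not part of the patch):

    #include <cassert>
    #include <numeric>

    int main() {
      // <4 x s32> vs. <6 x s32>: gcd of the element counts is 2 -> <2 x s32>.
      assert(std::gcd(4, 6) == 2);
      // s64 vs. s48: gcd of the bit widths is 16 -> s16.
      assert(std::gcd(64, 48) == 16);
      return 0;
    }
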
void LegalizerHelper::insertParts(Register DstReg,
LLT ResultTy, LLT PartTy,
ArrayRef<Register> PartRegs,
@@ -219,11 +240,29 @@ void LegalizerHelper::insertParts(Register DstReg,
static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
switch (Opcode) {
case TargetOpcode::G_SDIV:
- assert((Size == 32 || Size == 64) && "Unsupported size");
- return Size == 64 ? RTLIB::SDIV_I64 : RTLIB::SDIV_I32;
+ assert((Size == 32 || Size == 64 || Size == 128) && "Unsupported size");
+ switch (Size) {
+ case 32:
+ return RTLIB::SDIV_I32;
+ case 64:
+ return RTLIB::SDIV_I64;
+ case 128:
+ return RTLIB::SDIV_I128;
+ default:
+ llvm_unreachable("unexpected size");
+ }
case TargetOpcode::G_UDIV:
- assert((Size == 32 || Size == 64) && "Unsupported size");
- return Size == 64 ? RTLIB::UDIV_I64 : RTLIB::UDIV_I32;
+ assert((Size == 32 || Size == 64 || Size == 128) && "Unsupported size");
+ switch (Size) {
+ case 32:
+ return RTLIB::UDIV_I32;
+ case 64:
+ return RTLIB::UDIV_I64;
+ case 128:
+ return RTLIB::UDIV_I128;
+ default:
+ llvm_unreachable("unexpected size");
+ }
case TargetOpcode::G_SREM:
assert((Size == 32 || Size == 64) && "Unsupported size");
return Size == 64 ? RTLIB::SREM_I64 : RTLIB::SREM_I32;
@@ -288,6 +327,35 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
llvm_unreachable("Unknown libcall function");
}
+/// True if an instruction is in tail position in its caller. Intended for
+/// legalizing libcalls as tail calls when possible.
+static bool isLibCallInTailPosition(MachineInstr &MI) {
+ const Function &F = MI.getParent()->getParent()->getFunction();
+
+ // Conservatively require the attributes of the call to match those of
+ // the return. Ignore NoAlias and NonNull because they don't affect the
+ // call sequence.
+ AttributeList CallerAttrs = F.getAttributes();
+ if (AttrBuilder(CallerAttrs, AttributeList::ReturnIndex)
+ .removeAttribute(Attribute::NoAlias)
+ .removeAttribute(Attribute::NonNull)
+ .hasAttributes())
+ return false;
+
+ // It's not safe to eliminate the sign / zero extension of the return value.
+ if (CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt) ||
+ CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt))
+ return false;
+
+ // Only tail call if the following instruction is a standard return.
+ auto &TII = *MI.getMF()->getSubtarget().getInstrInfo();
+ MachineInstr *Next = MI.getNextNode();
+ if (!Next || TII.isTailCall(*Next) || !Next->isReturn())
+ return false;
+
+ return true;
+}
+
LegalizerHelper::LegalizeResult
llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
const CallLowering::ArgInfo &Result,
@@ -296,9 +364,12 @@ llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
const char *Name = TLI.getLibcallName(Libcall);
- MIRBuilder.getMF().getFrameInfo().setHasCalls(true);
- if (!CLI.lowerCall(MIRBuilder, TLI.getLibcallCallingConv(Libcall),
- MachineOperand::CreateES(Name), Result, Args))
+ CallLowering::CallLoweringInfo Info;
+ Info.CallConv = TLI.getLibcallCallingConv(Libcall);
+ Info.Callee = MachineOperand::CreateES(Name);
+ Info.OrigRet = Result;
+ std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
+ if (!CLI.lowerCall(MIRBuilder, Info))
return LegalizerHelper::UnableToLegalize;
return LegalizerHelper::Legalized;
@@ -317,6 +388,74 @@ simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
Args);
}
+LegalizerHelper::LegalizeResult
+llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS);
+ auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
+
+ SmallVector<CallLowering::ArgInfo, 3> Args;
+  // Add all the args, except for the last one, which is an imm denoting 'tail'.
+ for (unsigned i = 1; i < MI.getNumOperands() - 1; i++) {
+ Register Reg = MI.getOperand(i).getReg();
+
+    // We need to derive an IR type for call lowering.
+ LLT OpLLT = MRI.getType(Reg);
+ Type *OpTy = nullptr;
+ if (OpLLT.isPointer())
+ OpTy = Type::getInt8PtrTy(Ctx, OpLLT.getAddressSpace());
+ else
+ OpTy = IntegerType::get(Ctx, OpLLT.getSizeInBits());
+ Args.push_back({Reg, OpTy});
+ }
+
+ auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
+ auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
+ Intrinsic::ID ID = MI.getOperand(0).getIntrinsicID();
+ RTLIB::Libcall RTLibcall;
+ switch (ID) {
+ case Intrinsic::memcpy:
+ RTLibcall = RTLIB::MEMCPY;
+ break;
+ case Intrinsic::memset:
+ RTLibcall = RTLIB::MEMSET;
+ break;
+ case Intrinsic::memmove:
+ RTLibcall = RTLIB::MEMMOVE;
+ break;
+ default:
+ return LegalizerHelper::UnableToLegalize;
+ }
+ const char *Name = TLI.getLibcallName(RTLibcall);
+
+ MIRBuilder.setInstr(MI);
+
+ CallLowering::CallLoweringInfo Info;
+ Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
+ Info.Callee = MachineOperand::CreateES(Name);
+ Info.OrigRet = CallLowering::ArgInfo({0}, Type::getVoidTy(Ctx));
+ Info.IsTailCall = MI.getOperand(MI.getNumOperands() - 1).getImm() == 1 &&
+ isLibCallInTailPosition(MI);
+
+ std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
+ if (!CLI.lowerCall(MIRBuilder, Info))
+ return LegalizerHelper::UnableToLegalize;
+
+ if (Info.LoweredTailCall) {
+ assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");
+ // We must have a return following the call to get past
+ // isLibCallInTailPosition.
+ assert(MI.getNextNode() && MI.getNextNode()->isReturn() &&
+ "Expected instr following MI to be a return?");
+
+ // We lowered a tail call, so the call is now the return from the block.
+ // Delete the old return.
+ MI.getNextNode()->eraseFromParent();
+ }
+
+ return LegalizerHelper::Legalized;
+}
+
static RTLIB::Libcall getConvRTLibDesc(unsigned Opcode, Type *ToType,
Type *FromType) {
auto ToMVT = MVT::getVT(ToType);
@@ -518,6 +657,65 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
MI.eraseFromParent();
return Legalized;
}
+ case TargetOpcode::G_SEXT: {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT SrcTy = MRI.getType(SrcReg);
+
+ // FIXME: support the general case where the requested NarrowTy may not be
+ // the same as the source type. E.g. s128 = sext(s32)
+ if ((SrcTy.getSizeInBits() != SizeOp0 / 2) ||
+ SrcTy.getSizeInBits() != NarrowTy.getSizeInBits()) {
+ LLVM_DEBUG(dbgs() << "Can't narrow sext to type " << NarrowTy << "\n");
+ return UnableToLegalize;
+ }
+
+ // Shift the sign bit of the low register through the high register.
+ auto ShiftAmt =
+ MIRBuilder.buildConstant(LLT::scalar(64), NarrowTy.getSizeInBits() - 1);
+ auto Shift = MIRBuilder.buildAShr(NarrowTy, SrcReg, ShiftAmt);
+ MIRBuilder.buildMerge(MI.getOperand(0).getReg(), {SrcReg, Shift.getReg(0)});
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_ZEXT: {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
+ uint64_t SizeOp1 = SrcTy.getSizeInBits();
+ if (SizeOp0 % SizeOp1 != 0)
+ return UnableToLegalize;
+
+ // Generate a merge where the bottom bits are taken from the source, and
+ // zero everything else.
+ Register ZeroReg = MIRBuilder.buildConstant(SrcTy, 0).getReg(0);
+ unsigned NumParts = SizeOp0 / SizeOp1;
+ SmallVector<Register, 4> Srcs = {MI.getOperand(1).getReg()};
+ for (unsigned Part = 1; Part < NumParts; ++Part)
+ Srcs.push_back(ZeroReg);
+ MIRBuilder.buildMerge(MI.getOperand(0).getReg(), Srcs);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case TargetOpcode::G_TRUNC: {
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+
+ uint64_t SizeOp1 = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ if (NarrowTy.getSizeInBits() * 2 != SizeOp1) {
+ LLVM_DEBUG(dbgs() << "Can't narrow trunc to type " << NarrowTy << "\n");
+ return UnableToLegalize;
+ }
+
+ auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1).getReg());
+ MIRBuilder.buildCopy(MI.getOperand(0).getReg(), Unmerge.getReg(0));
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
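
A scalar model of the new G_SEXT narrowing, assuming the s64 = sext(s32) shape the case handles: the low half is the source itself, and the high half replicates its sign bit with an arithmetic shift by NarrowBits - 1 (sketch only; names hypothetical):

    #include <cassert>
    #include <cstdint>

    static void sext32to64(int32_t Src, uint32_t &Lo, uint32_t &Hi) {
      Lo = static_cast<uint32_t>(Src);
      Hi = static_cast<uint32_t>(Src >> 31); // the G_ASHR, then G_MERGE_VALUES
    }

    int main() {
      uint32_t Lo, Hi;
      sext32to64(-5, Lo, Hi);
      assert(Lo == 0xFFFFFFFBu && Hi == 0xFFFFFFFFu);
      return 0;
    }
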
case TargetOpcode::G_ADD: {
// FIXME: add support for when SizeOp0 isn't an exact multiple of
// NarrowSize.
@@ -530,15 +728,17 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, Src1Regs);
extractParts(MI.getOperand(2).getReg(), NarrowTy, NumParts, Src2Regs);
- Register CarryIn = MRI.createGenericVirtualRegister(LLT::scalar(1));
- MIRBuilder.buildConstant(CarryIn, 0);
-
+ Register CarryIn;
for (int i = 0; i < NumParts; ++i) {
Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
Register CarryOut = MRI.createGenericVirtualRegister(LLT::scalar(1));
- MIRBuilder.buildUAdde(DstReg, CarryOut, Src1Regs[i],
- Src2Regs[i], CarryIn);
+ if (i == 0)
+ MIRBuilder.buildUAddo(DstReg, CarryOut, Src1Regs[i], Src2Regs[i]);
+ else {
+ MIRBuilder.buildUAdde(DstReg, CarryOut, Src1Regs[i],
+ Src2Regs[i], CarryIn);
+ }
DstRegs.push_back(DstReg);
CarryIn = CarryOut;
@@ -730,7 +930,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
for (unsigned j = 1; j < MI.getNumOperands(); j += 2)
MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1));
}
- MIRBuilder.setInsertPt(MBB, --MBB.getFirstNonPHI());
+ MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());
MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs);
Observer.changedInstr(MI);
MI.eraseFromParent();
@@ -763,6 +963,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
CmpInst::Predicate Pred =
static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
+ LLT ResTy = MRI.getType(MI.getOperand(0).getReg());
if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) {
MachineInstrBuilder XorL = MIRBuilder.buildXor(NarrowTy, LHSL, RHSL);
@@ -771,18 +972,109 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
MachineInstrBuilder Zero = MIRBuilder.buildConstant(NarrowTy, 0);
MIRBuilder.buildICmp(Pred, MI.getOperand(0).getReg(), Or, Zero);
} else {
- const LLT s1 = LLT::scalar(1);
- MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, s1, LHSH, RHSH);
+ MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, ResTy, LHSH, RHSH);
MachineInstrBuilder CmpHEQ =
- MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, s1, LHSH, RHSH);
+ MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, LHSH, RHSH);
MachineInstrBuilder CmpLU = MIRBuilder.buildICmp(
- ICmpInst::getUnsignedPredicate(Pred), s1, LHSL, RHSL);
+ ICmpInst::getUnsignedPredicate(Pred), ResTy, LHSL, RHSL);
MIRBuilder.buildSelect(MI.getOperand(0).getReg(), CmpHEQ, CmpLU, CmpH);
}
Observer.changedInstr(MI);
MI.eraseFromParent();
return Legalized;
}
+ case TargetOpcode::G_SEXT_INREG: {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ if (!MI.getOperand(2).isImm())
+ return UnableToLegalize;
+ int64_t SizeInBits = MI.getOperand(2).getImm();
+
+    // So long as the new type has more bits than the bits we're extending, we
+ // don't need to break it apart.
+ if (NarrowTy.getScalarSizeInBits() >= SizeInBits) {
+ Observer.changingInstr(MI);
+ // We don't lose any non-extension bits by truncating the src and
+ // sign-extending the dst.
+ MachineOperand &MO1 = MI.getOperand(1);
+ auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1.getReg());
+ MO1.setReg(TruncMIB->getOperand(0).getReg());
+
+ MachineOperand &MO2 = MI.getOperand(0);
+ Register DstExt = MRI.createGenericVirtualRegister(NarrowTy);
+ MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
+ MIRBuilder.buildInstr(TargetOpcode::G_SEXT, {MO2.getReg()}, {DstExt});
+ MO2.setReg(DstExt);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+
+ // Break it apart. Components below the extension point are unmodified. The
+ // component containing the extension point becomes a narrower SEXT_INREG.
+ // Components above it are ashr'd from the component containing the
+ // extension point.
+ if (SizeOp0 % NarrowSize != 0)
+ return UnableToLegalize;
+ int NumParts = SizeOp0 / NarrowSize;
+
+ // List the registers where the destination will be scattered.
+ SmallVector<Register, 2> DstRegs;
+ // List the registers where the source will be split.
+ SmallVector<Register, 2> SrcRegs;
+
+ // Create all the temporary registers.
+ for (int i = 0; i < NumParts; ++i) {
+ Register SrcReg = MRI.createGenericVirtualRegister(NarrowTy);
+
+ SrcRegs.push_back(SrcReg);
+ }
+
+ // Explode the big arguments into smaller chunks.
+ MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1).getReg());
+
+ Register AshrCstReg =
+ MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1)
+ ->getOperand(0)
+ .getReg();
+ Register FullExtensionReg = 0;
+ Register PartialExtensionReg = 0;
+
+ // Do the operation on each small part.
+ for (int i = 0; i < NumParts; ++i) {
+ if ((i + 1) * NarrowTy.getScalarSizeInBits() < SizeInBits)
+ DstRegs.push_back(SrcRegs[i]);
+ else if (i * NarrowTy.getScalarSizeInBits() > SizeInBits) {
+ assert(PartialExtensionReg &&
+ "Expected to visit partial extension before full");
+ if (FullExtensionReg) {
+ DstRegs.push_back(FullExtensionReg);
+ continue;
+ }
+ DstRegs.push_back(MIRBuilder
+ .buildInstr(TargetOpcode::G_ASHR, {NarrowTy},
+ {PartialExtensionReg, AshrCstReg})
+ ->getOperand(0)
+ .getReg());
+ FullExtensionReg = DstRegs.back();
+ } else {
+ DstRegs.push_back(
+ MIRBuilder
+ .buildInstr(
+ TargetOpcode::G_SEXT_INREG, {NarrowTy},
+ {SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
+ ->getOperand(0)
+ .getReg());
+ PartialExtensionReg = DstRegs.back();
+ }
+ }
+
+ // Gather the destination registers into the final destination.
+ Register DstReg = MI.getOperand(0).getReg();
+ MIRBuilder.buildMerge(DstReg, DstRegs);
+ MI.eraseFromParent();
+ return Legalized;
+ }
}
}
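
A scalar model of the G_SEXT_INREG narrowing above, assuming a 64-bit value split into two 32-bit parts with the extension point at bit 8: the part holding the extension point becomes a narrower sext_inreg, and the part above it is an ashr of that partial result (sketch only; names hypothetical):

    #include <cassert>
    #include <cstdint>

    static void sextInReg64From8(uint32_t Lo, uint32_t &OutLo, uint32_t &OutHi) {
      // sext_inreg(lo, 8) on the part containing the extension point.
      int32_t Partial = static_cast<int32_t>(Lo << 24) >> 24;
      OutLo = static_cast<uint32_t>(Partial);
      OutHi = static_cast<uint32_t>(Partial >> 31); // ashr fill for the part above
    }

    int main() {
      uint32_t Lo, Hi;
      sextInReg64From8(0x80u, Lo, Hi); // bit 7 set: negative after extension
      assert(Lo == 0xFFFFFF80u && Hi == 0xFFFFFFFFu);
      return 0;
    }
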
@@ -892,7 +1184,7 @@ LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
- Register NextResult = I + 1 == NumOps && WideSize == DstSize ? DstReg :
+ Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
MRI.createGenericVirtualRegister(WideTy);
auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
@@ -903,6 +1195,8 @@ LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
if (WideSize > DstSize)
MIRBuilder.buildTrunc(DstReg, ResultReg);
+ else if (DstTy.isPointer())
+ MIRBuilder.buildIntToPtr(DstReg, ResultReg);
MI.eraseFromParent();
return Legalized;
@@ -1218,6 +1512,24 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
Observer.changedInstr(MI);
return Legalized;
}
+ case TargetOpcode::G_BITREVERSE: {
+ Observer.changingInstr(MI);
+
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(DstReg);
+ unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
+
+ Register DstExt = MRI.createGenericVirtualRegister(WideTy);
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
+ MI.getOperand(0).setReg(DstExt);
+ MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
+
+ auto ShiftAmt = MIRBuilder.buildConstant(WideTy, DiffBits);
+ auto Shift = MIRBuilder.buildLShr(WideTy, DstExt, ShiftAmt);
+ MIRBuilder.buildTrunc(DstReg, Shift);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
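
A scalar model of the G_BITREVERSE widening above: reversing an s8 value through a wide s32 reverse leaves the interesting bits at the top, so the result is shifted right by DiffBits before truncating (sketch with a naive reference reverse; names hypothetical):

    #include <cassert>
    #include <cstdint>

    static uint32_t bitrev32(uint32_t V) { // naive reference reverse
      uint32_t R = 0;
      for (int I = 0; I < 32; ++I)
        R |= ((V >> I) & 1u) << (31 - I);
      return R;
    }

    static uint8_t bitrev8(uint8_t V) {
      return static_cast<uint8_t>(bitrev32(V) >> 24); // lshr by DiffBits = 24
    }

    int main() {
      assert(bitrev8(0x01) == 0x80);
      return 0;
    }
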
case TargetOpcode::G_ADD:
case TargetOpcode::G_AND:
case TargetOpcode::G_MUL:
@@ -1310,13 +1622,15 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
case TargetOpcode::G_FPTOSI:
case TargetOpcode::G_FPTOUI:
- if (TypeIdx != 0)
- return UnableToLegalize;
Observer.changingInstr(MI);
- widenScalarDst(MI, WideTy);
+
+ if (TypeIdx == 0)
+ widenScalarDst(MI, WideTy);
+ else
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_FPEXT);
+
Observer.changedInstr(MI);
return Legalized;
-
case TargetOpcode::G_SITOFP:
if (TypeIdx != 1)
return UnableToLegalize;
@@ -1483,6 +1797,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
case TargetOpcode::G_FMUL:
case TargetOpcode::G_FSUB:
case TargetOpcode::G_FMA:
+ case TargetOpcode::G_FMAD:
case TargetOpcode::G_FNEG:
case TargetOpcode::G_FABS:
case TargetOpcode::G_FCANONICALIZE:
@@ -1553,6 +1868,15 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
Observer.changedInstr(MI);
return Legalized;
}
+ case TargetOpcode::G_SEXT_INREG:
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
+ widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC);
+ Observer.changedInstr(MI);
+ return Legalized;
}
}
@@ -1579,6 +1903,9 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
MI.eraseFromParent();
return Legalized;
}
+ case TargetOpcode::G_SADDO:
+ case TargetOpcode::G_SSUBO:
+ return lowerSADDO_SSUBO(MI);
case TargetOpcode::G_SMULO:
case TargetOpcode::G_UMULO: {
// Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
@@ -1669,6 +1996,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
MI.eraseFromParent();
return Legalized;
}
+ case TargetOpcode::G_FMAD:
+ return lowerFMad(MI);
case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
Register OldValRes = MI.getOperand(0).getReg();
Register SuccessRes = MI.getOperand(1).getReg();
@@ -1690,11 +2019,57 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
LLT DstTy = MRI.getType(DstReg);
auto &MMO = **MI.memoperands_begin();
- if (DstTy.getSizeInBits() == MMO.getSize() /* in bytes */ * 8) {
- // In the case of G_LOAD, this was a non-extending load already and we're
- // about to lower to the same instruction.
- if (MI.getOpcode() == TargetOpcode::G_LOAD)
+ if (DstTy.getSizeInBits() == MMO.getSizeInBits()) {
+ if (MI.getOpcode() == TargetOpcode::G_LOAD) {
+ // This load needs splitting into power of 2 sized loads.
+ if (DstTy.isVector())
return UnableToLegalize;
+ if (isPowerOf2_32(DstTy.getSizeInBits()))
+ return UnableToLegalize; // Don't know what we're being asked to do.
+
+ // Our strategy here is to generate anyextending loads for the smaller
+ // types up to next power-2 result type, and then combine the two larger
+ // result values together, before truncating back down to the non-pow-2
+ // type.
+ // E.g. v1 = i24 load =>
+ // v2 = i32 load (2 byte)
+ // v3 = i32 load (1 byte)
+ // v4 = i32 shl v3, 16
+ // v5 = i32 or v4, v2
+ // v1 = i24 trunc v5
+ // By doing this we generate the correct truncate which should get
+ // combined away as an artifact with a matching extend.
+ uint64_t LargeSplitSize = PowerOf2Floor(DstTy.getSizeInBits());
+ uint64_t SmallSplitSize = DstTy.getSizeInBits() - LargeSplitSize;
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ MachineMemOperand *LargeMMO =
+ MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
+ MachineMemOperand *SmallMMO = MF.getMachineMemOperand(
+ &MMO, LargeSplitSize / 8, SmallSplitSize / 8);
+
+ LLT PtrTy = MRI.getType(PtrReg);
+ unsigned AnyExtSize = NextPowerOf2(DstTy.getSizeInBits());
+ LLT AnyExtTy = LLT::scalar(AnyExtSize);
+ Register LargeLdReg = MRI.createGenericVirtualRegister(AnyExtTy);
+ Register SmallLdReg = MRI.createGenericVirtualRegister(AnyExtTy);
+ auto LargeLoad =
+ MIRBuilder.buildLoad(LargeLdReg, PtrReg, *LargeMMO);
+
+ auto OffsetCst =
+ MIRBuilder.buildConstant(LLT::scalar(64), LargeSplitSize / 8);
+ Register GEPReg = MRI.createGenericVirtualRegister(PtrTy);
+ auto SmallPtr = MIRBuilder.buildGEP(GEPReg, PtrReg, OffsetCst.getReg(0));
+ auto SmallLoad = MIRBuilder.buildLoad(SmallLdReg, SmallPtr.getReg(0),
+ *SmallMMO);
+
+ auto ShiftAmt = MIRBuilder.buildConstant(AnyExtTy, LargeSplitSize);
+ auto Shift = MIRBuilder.buildShl(AnyExtTy, SmallLoad, ShiftAmt);
+ auto Or = MIRBuilder.buildOr(AnyExtTy, Shift, LargeLoad);
+ MIRBuilder.buildTrunc(DstReg, {Or.getReg(0)});
+ MI.eraseFromParent();
+ return Legalized;
+ }
MIRBuilder.buildLoad(DstReg, PtrReg, MMO);
MI.eraseFromParent();
return Legalized;
@@ -1723,6 +2098,51 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
return UnableToLegalize;
}
+ case TargetOpcode::G_STORE: {
+ // Lower a non-power of 2 store into multiple pow-2 stores.
+ // E.g. split an i24 store into an i16 store + i8 store.
+ // We do this by first extending the stored value to the next largest power
+ // of 2 type, and then using truncating stores to store the components.
+    // By doing this, as with G_LOAD, we generate an extend that can be
+ // artifact-combined away instead of leaving behind extracts.
+ Register SrcReg = MI.getOperand(0).getReg();
+ Register PtrReg = MI.getOperand(1).getReg();
+ LLT SrcTy = MRI.getType(SrcReg);
+ MachineMemOperand &MMO = **MI.memoperands_begin();
+ if (SrcTy.getSizeInBits() != MMO.getSizeInBits())
+ return UnableToLegalize;
+ if (SrcTy.isVector())
+ return UnableToLegalize;
+ if (isPowerOf2_32(SrcTy.getSizeInBits()))
+ return UnableToLegalize; // Don't know what we're being asked to do.
+
+ // Extend to the next pow-2.
+ const LLT ExtendTy = LLT::scalar(NextPowerOf2(SrcTy.getSizeInBits()));
+ auto ExtVal = MIRBuilder.buildAnyExt(ExtendTy, SrcReg);
+
+ // Obtain the smaller value by shifting away the larger value.
+ uint64_t LargeSplitSize = PowerOf2Floor(SrcTy.getSizeInBits());
+ uint64_t SmallSplitSize = SrcTy.getSizeInBits() - LargeSplitSize;
+ auto ShiftAmt = MIRBuilder.buildConstant(ExtendTy, LargeSplitSize);
+ auto SmallVal = MIRBuilder.buildLShr(ExtendTy, ExtVal, ShiftAmt);
+
+ // Generate the GEP and truncating stores.
+ LLT PtrTy = MRI.getType(PtrReg);
+ auto OffsetCst =
+ MIRBuilder.buildConstant(LLT::scalar(64), LargeSplitSize / 8);
+ Register GEPReg = MRI.createGenericVirtualRegister(PtrTy);
+ auto SmallPtr = MIRBuilder.buildGEP(GEPReg, PtrReg, OffsetCst.getReg(0));
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ MachineMemOperand *LargeMMO =
+ MF.getMachineMemOperand(&MMO, 0, LargeSplitSize / 8);
+ MachineMemOperand *SmallMMO =
+ MF.getMachineMemOperand(&MMO, LargeSplitSize / 8, SmallSplitSize / 8);
+ MIRBuilder.buildStore(ExtVal.getReg(0), PtrReg, *LargeMMO);
+ MIRBuilder.buildStore(SmallVal.getReg(0), SmallPtr.getReg(0), *SmallMMO);
+ MI.eraseFromParent();
+ return Legalized;
+ }
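
A byte-level model of the store split above, assuming a little-endian host: an i24 value becomes an i16 store at offset 0 plus an i8 store of the value shifted right by LargeSplitSize at offset 2 (sketch only; names hypothetical):

    #include <cstdint>
    #include <cstring>

    static void storeI24(uint8_t *P, uint32_t V) {
      uint16_t Large = static_cast<uint16_t>(V);     // truncating i16 store
      uint8_t Small = static_cast<uint8_t>(V >> 16); // lshr by LargeSplitSize
      std::memcpy(P, &Large, 2);                     // offset 0
      std::memcpy(P + 2, &Small, 1);                 // offset LargeSplitSize / 8
    }

    int main() {
      uint8_t Buf[3];
      storeI24(Buf, 0x00ABCDEF);
      return Buf[0] == 0xEF && Buf[1] == 0xCD && Buf[2] == 0xAB ? 0 : 1;
    }
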
case TargetOpcode::G_CTLZ_ZERO_UNDEF:
case TargetOpcode::G_CTTZ_ZERO_UNDEF:
case TargetOpcode::G_CTLZ:
@@ -1797,6 +2217,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
return lowerUITOFP(MI, TypeIdx, Ty);
case G_SITOFP:
return lowerSITOFP(MI, TypeIdx, Ty);
+ case G_FPTOUI:
+ return lowerFPTOUI(MI, TypeIdx, Ty);
case G_SMIN:
case G_SMAX:
case G_UMIN:
@@ -1807,6 +2229,31 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
case G_FMINNUM:
case G_FMAXNUM:
return lowerFMinNumMaxNum(MI);
+ case G_UNMERGE_VALUES:
+ return lowerUnmergeValues(MI);
+ case TargetOpcode::G_SEXT_INREG: {
+ assert(MI.getOperand(2).isImm() && "Expected immediate");
+ int64_t SizeInBits = MI.getOperand(2).getImm();
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ Register TmpRes = MRI.createGenericVirtualRegister(DstTy);
+
+    auto MIBSz = MIRBuilder.buildConstant(
+        DstTy, DstTy.getScalarSizeInBits() - SizeInBits);
+    MIRBuilder.buildInstr(TargetOpcode::G_SHL, {TmpRes},
+                          {SrcReg, MIBSz->getOperand(0).getReg()});
+    MIRBuilder.buildInstr(TargetOpcode::G_ASHR, {DstReg},
+                          {TmpRes, MIBSz->getOperand(0).getReg()});
+ MI.eraseFromParent();
+ return Legalized;
+ }
+ case G_SHUFFLE_VECTOR:
+ return lowerShuffleVector(MI);
+ case G_DYN_STACKALLOC:
+ return lowerDynStackAlloc(MI);
+ case G_EXTRACT:
+ return lowerExtract(MI);
+ case G_INSERT:
+ return lowerInsert(MI);
}
}
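
The G_SEXT_INREG lowering above is the usual shift-pair identity; a scalar sketch (helper name hypothetical, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // Sign-extend the low SizeInBits bits of V: shift the field to the top,
    // then arithmetic-shift it back down (G_SHL followed by G_ASHR).
    static int64_t sextInReg(int64_t V, unsigned SizeInBits) {
      unsigned Shift = 64 - SizeInBits;
      return static_cast<int64_t>(static_cast<uint64_t>(V) << Shift) >> Shift;
    }

    int main() {
      assert(sextInReg(0xFF, 8) == -1);
      assert(sextInReg(0x7F, 8) == 127);
      return 0;
    }
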
@@ -2283,6 +2730,105 @@ LegalizerHelper::fewerElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
}
LegalizerHelper::LegalizeResult
+LegalizerHelper::fewerElementsVectorUnmergeValues(MachineInstr &MI,
+ unsigned TypeIdx,
+ LLT NarrowTy) {
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+
+ const int NumDst = MI.getNumOperands() - 1;
+ const Register SrcReg = MI.getOperand(NumDst).getReg();
+ LLT SrcTy = MRI.getType(SrcReg);
+
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+
+ // TODO: Create sequence of extracts.
+ if (DstTy == NarrowTy)
+ return UnableToLegalize;
+
+ LLT GCDTy = getGCDType(SrcTy, NarrowTy);
+ if (DstTy == GCDTy) {
+ // This would just be a copy of the same unmerge.
+ // TODO: Create extracts, pad with undef and create intermediate merges.
+ return UnableToLegalize;
+ }
+
+ auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
+ const int NumUnmerge = Unmerge->getNumOperands() - 1;
+ const int PartsPerUnmerge = NumDst / NumUnmerge;
+
+ for (int I = 0; I != NumUnmerge; ++I) {
+ auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
+
+ for (int J = 0; J != PartsPerUnmerge; ++J)
+ MIB.addDef(MI.getOperand(I * PartsPerUnmerge + J).getReg());
+ MIB.addUse(Unmerge.getReg(I));
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::fewerElementsVectorBuildVector(MachineInstr &MI,
+ unsigned TypeIdx,
+ LLT NarrowTy) {
+ assert(TypeIdx == 0 && "not a vector type index");
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ LLT SrcTy = DstTy.getElementType();
+
+ int DstNumElts = DstTy.getNumElements();
+ int NarrowNumElts = NarrowTy.getNumElements();
+ int NumConcat = (DstNumElts + NarrowNumElts - 1) / NarrowNumElts;
+ LLT WidenedDstTy = LLT::vector(NarrowNumElts * NumConcat, SrcTy);
+
+ SmallVector<Register, 8> ConcatOps;
+ SmallVector<Register, 8> SubBuildVector;
+
+ Register UndefReg;
+ if (WidenedDstTy != DstTy)
+ UndefReg = MIRBuilder.buildUndef(SrcTy).getReg(0);
+
+ // Create a G_CONCAT_VECTORS of NarrowTy pieces, padding with undef as
+ // necessary.
+ //
+ // %3:_(<3 x s16>) = G_BUILD_VECTOR %0, %1, %2
+ // -> <2 x s16>
+ //
+ // %4:_(s16) = G_IMPLICIT_DEF
+ // %5:_(<2 x s16>) = G_BUILD_VECTOR %0, %1
+ // %6:_(<2 x s16>) = G_BUILD_VECTOR %2, %4
+ // %7:_(<4 x s16>) = G_CONCAT_VECTORS %5, %6
+ // %3:_(<3 x s16>) = G_EXTRACT %7, 0
+ for (int I = 0; I != NumConcat; ++I) {
+ for (int J = 0; J != NarrowNumElts; ++J) {
+ int SrcIdx = NarrowNumElts * I + J;
+
+ if (SrcIdx < DstNumElts) {
+ Register SrcReg = MI.getOperand(SrcIdx + 1).getReg();
+ SubBuildVector.push_back(SrcReg);
+ } else
+ SubBuildVector.push_back(UndefReg);
+ }
+
+ auto BuildVec = MIRBuilder.buildBuildVector(NarrowTy, SubBuildVector);
+ ConcatOps.push_back(BuildVec.getReg(0));
+ SubBuildVector.clear();
+ }
+
+ if (DstTy == WidenedDstTy)
+ MIRBuilder.buildConcatVectors(DstReg, ConcatOps);
+ else {
+ auto Concat = MIRBuilder.buildConcatVectors(WidenedDstTy, ConcatOps);
+ MIRBuilder.buildExtract(DstReg, Concat, 0);
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
LegalizerHelper::reduceLoadStoreWidth(MachineInstr &MI, unsigned TypeIdx,
LLT NarrowTy) {
// FIXME: Don't know how to handle secondary types yet.
@@ -2395,6 +2941,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
case G_FDIV:
case G_FREM:
case G_FMA:
+ case G_FMAD:
case G_FPOW:
case G_FEXP:
case G_FEXP2:
@@ -2411,6 +2958,7 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
case G_FSIN:
case G_FSQRT:
case G_BSWAP:
+ case G_BITREVERSE:
case G_SDIV:
case G_SMIN:
case G_SMAX:
@@ -2453,6 +3001,10 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
return fewerElementsVectorSelect(MI, TypeIdx, NarrowTy);
case G_PHI:
return fewerElementsVectorPhi(MI, TypeIdx, NarrowTy);
+ case G_UNMERGE_VALUES:
+ return fewerElementsVectorUnmergeValues(MI, TypeIdx, NarrowTy);
+ case G_BUILD_VECTOR:
+ return fewerElementsVectorBuildVector(MI, TypeIdx, NarrowTy);
case G_LOAD:
case G_STORE:
return reduceLoadStoreWidth(MI, TypeIdx, NarrowTy);
@@ -2604,11 +3156,11 @@ LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
switch (MI.getOpcode()) {
case TargetOpcode::G_SHL: {
// Short: ShAmt < NewBitSize
- auto LoS = MIRBuilder.buildShl(HalfTy, InH, Amt);
+ auto LoS = MIRBuilder.buildShl(HalfTy, InL, Amt);
- auto OrLHS = MIRBuilder.buildShl(HalfTy, InH, Amt);
- auto OrRHS = MIRBuilder.buildLShr(HalfTy, InL, AmtLack);
- auto HiS = MIRBuilder.buildOr(HalfTy, OrLHS, OrRHS);
+ auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, AmtLack);
+ auto HiOr = MIRBuilder.buildShl(HalfTy, InH, Amt);
+ auto HiS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
// Long: ShAmt >= NewBitSize
auto LoL = MIRBuilder.buildConstant(HalfTy, 0); // Lo part is zero.
@@ -2622,41 +3174,25 @@ LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
ResultRegs[1] = Hi.getReg(0);
break;
}
- case TargetOpcode::G_LSHR: {
- // Short: ShAmt < NewBitSize
- auto HiS = MIRBuilder.buildLShr(HalfTy, InH, Amt);
-
- auto OrLHS = MIRBuilder.buildLShr(HalfTy, InL, Amt);
- auto OrRHS = MIRBuilder.buildShl(HalfTy, InH, AmtLack);
- auto LoS = MIRBuilder.buildOr(HalfTy, OrLHS, OrRHS);
-
- // Long: ShAmt >= NewBitSize
- auto HiL = MIRBuilder.buildConstant(HalfTy, 0); // Hi part is zero.
- auto LoL = MIRBuilder.buildLShr(HalfTy, InH, AmtExcess); // Lo from Hi part.
-
- auto Lo = MIRBuilder.buildSelect(
- HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));
- auto Hi = MIRBuilder.buildSelect(HalfTy, IsShort, HiS, HiL);
-
- ResultRegs[0] = Lo.getReg(0);
- ResultRegs[1] = Hi.getReg(0);
- break;
- }
+ case TargetOpcode::G_LSHR:
case TargetOpcode::G_ASHR: {
// Short: ShAmt < NewBitSize
- auto HiS = MIRBuilder.buildAShr(HalfTy, InH, Amt);
+ auto HiS = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy}, {InH, Amt});
- auto OrLHS = MIRBuilder.buildLShr(HalfTy, InL, Amt);
- auto OrRHS = MIRBuilder.buildLShr(HalfTy, InH, AmtLack);
- auto LoS = MIRBuilder.buildOr(HalfTy, OrLHS, OrRHS);
+ auto LoOr = MIRBuilder.buildLShr(HalfTy, InL, Amt);
+ auto HiOr = MIRBuilder.buildShl(HalfTy, InH, AmtLack);
+ auto LoS = MIRBuilder.buildOr(HalfTy, LoOr, HiOr);
// Long: ShAmt >= NewBitSize
-
- // Sign of Hi part.
- auto HiL = MIRBuilder.buildAShr(
- HalfTy, InH, MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1));
-
- auto LoL = MIRBuilder.buildAShr(HalfTy, InH, AmtExcess); // Lo from Hi part.
+ MachineInstrBuilder HiL;
+ if (MI.getOpcode() == TargetOpcode::G_LSHR) {
+ HiL = MIRBuilder.buildConstant(HalfTy, 0); // Hi part is zero.
+ } else {
+ auto ShiftAmt = MIRBuilder.buildConstant(ShiftAmtTy, NewBitSize - 1);
+ HiL = MIRBuilder.buildAShr(HalfTy, InH, ShiftAmt); // Sign of Hi part.
+ }
+ auto LoL = MIRBuilder.buildInstr(MI.getOpcode(), {HalfTy},
+ {InH, AmtExcess}); // Lo from Hi part.
auto Lo = MIRBuilder.buildSelect(
HalfTy, IsZero, InL, MIRBuilder.buildSelect(HalfTy, IsShort, LoS, LoL));
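
A scalar model of the narrowed shift for the G_SHL case above, assuming a 64-bit shift performed on 32-bit halves with Amt < 64; the Amt == 0 branch stands in for the IsZero select (sketch only; names hypothetical):

    #include <cassert>
    #include <cstdint>

    static void shl64(uint32_t InL, uint32_t InH, unsigned Amt, uint32_t &Lo,
                      uint32_t &Hi) {
      if (Amt == 0) {                       // IsZero: pass the input through
        Lo = InL;
        Hi = InH;
      } else if (Amt < 32) {                // Short: ShAmt < NewBitSize
        Lo = InL << Amt;
        Hi = (InH << Amt) | (InL >> (32 - Amt));
      } else {                              // Long: ShAmt >= NewBitSize
        Lo = 0;
        Hi = InL << (Amt - 32);
      }
    }

    int main() {
      uint32_t Lo, Hi;
      shl64(0x80000001u, 0, 1, Lo, Hi);
      assert(Lo == 2u && Hi == 1u);
      return 0;
    }
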
@@ -2701,12 +3237,22 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
MIRBuilder.setInstr(MI);
unsigned Opc = MI.getOpcode();
switch (Opc) {
- case TargetOpcode::G_IMPLICIT_DEF: {
+ case TargetOpcode::G_IMPLICIT_DEF:
+ case TargetOpcode::G_LOAD: {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
Observer.changingInstr(MI);
moreElementsVectorDst(MI, MoreTy, 0);
Observer.changedInstr(MI);
return Legalized;
}
+ case TargetOpcode::G_STORE:
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+ Observer.changingInstr(MI);
+ moreElementsVectorSrc(MI, MoreTy, 0);
+ Observer.changedInstr(MI);
+ return Legalized;
case TargetOpcode::G_AND:
case TargetOpcode::G_OR:
case TargetOpcode::G_XOR:
@@ -2748,6 +3294,26 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
moreElementsVectorDst(MI, MoreTy, 0);
Observer.changedInstr(MI);
return Legalized;
+ case TargetOpcode::G_UNMERGE_VALUES: {
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+ int NumDst = MI.getNumOperands() - 1;
+ moreElementsVectorSrc(MI, MoreTy, NumDst);
+
+ auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
+ for (int I = 0; I != NumDst; ++I)
+ MIB.addDef(MI.getOperand(I).getReg());
+
+ int NewNumDst = MoreTy.getSizeInBits() / DstTy.getSizeInBits();
+ for (int I = NumDst; I != NewNumDst; ++I)
+ MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
+
+ MIB.addUse(MI.getOperand(NumDst).getReg());
+ MI.eraseFromParent();
+ return Legalized;
+ }
case TargetOpcode::G_PHI:
return moreElementsVectorPhi(MI, TypeIdx, MoreTy);
default:
@@ -3310,6 +3876,48 @@ LegalizerHelper::lowerSITOFP(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
return UnableToLegalize;
}
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerFPTOUI(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ LLT SrcTy = MRI.getType(Src);
+ const LLT S64 = LLT::scalar(64);
+ const LLT S32 = LLT::scalar(32);
+
+ if (SrcTy != S64 && SrcTy != S32)
+ return UnableToLegalize;
+ if (DstTy != S32 && DstTy != S64)
+ return UnableToLegalize;
+
+  // FPTOSI gives the same result as FPTOUI for positive signed integers.
+  // FPTOUI additionally has to handle fp values that convert to unsigned
+  // integers greater than or equal to 2^31 for float or 2^63 for double;
+  // call this bound 2^Exp for brevity.
+
+ APInt TwoPExpInt = APInt::getSignMask(DstTy.getSizeInBits());
+ APFloat TwoPExpFP(SrcTy.getSizeInBits() == 32 ? APFloat::IEEEsingle()
+ : APFloat::IEEEdouble(),
+ APInt::getNullValue(SrcTy.getSizeInBits()));
+ TwoPExpFP.convertFromAPInt(TwoPExpInt, false, APFloat::rmNearestTiesToEven);
+
+ MachineInstrBuilder FPTOSI = MIRBuilder.buildFPTOSI(DstTy, Src);
+
+ MachineInstrBuilder Threshold = MIRBuilder.buildFConstant(SrcTy, TwoPExpFP);
+  // For fp values greater than or equal to Threshold (2^Exp), use FPTOSI on
+  // (Value - 2^Exp) and add 2^Exp back by setting the result's highest bit.
+ MachineInstrBuilder FSub = MIRBuilder.buildFSub(SrcTy, Src, Threshold);
+ MachineInstrBuilder ResLowBits = MIRBuilder.buildFPTOSI(DstTy, FSub);
+ MachineInstrBuilder ResHighBit = MIRBuilder.buildConstant(DstTy, TwoPExpInt);
+ MachineInstrBuilder Res = MIRBuilder.buildXor(DstTy, ResLowBits, ResHighBit);
+
+ MachineInstrBuilder FCMP =
+ MIRBuilder.buildFCmp(CmpInst::FCMP_ULT, DstTy, Src, Threshold);
+ MIRBuilder.buildSelect(Dst, FCMP, FPTOSI, Res);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
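
A scalar model of lowerFPTOUI for the f64 to u64 case: values below 2^63 take the plain FPTOSI path, larger ones convert (Value - 2^63) and XOR the top bit back in (sketch only, not the MIR the helper emits):

    #include <cassert>
    #include <cstdint>

    static uint64_t fptoui64(double X) {
      const double Threshold = 9223372036854775808.0; // 2^63
      if (X < Threshold)                              // the FCMP ult + select
        return static_cast<uint64_t>(static_cast<int64_t>(X));
      uint64_t Low = static_cast<uint64_t>(static_cast<int64_t>(X - Threshold));
      return Low ^ (1ULL << 63);                      // set the 2^Exp bit back
    }

    int main() {
      assert(fptoui64(42.0) == 42);
      assert(fptoui64(9223372036854775808.0) == (1ULL << 63));
      return 0;
    }
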
static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
switch (Opc) {
case TargetOpcode::G_SMIN:
@@ -3419,3 +4027,251 @@ LegalizerHelper::lowerFMinNumMaxNum(MachineInstr &MI) {
MI.eraseFromParent();
return Legalized;
}
+
+LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
+ // Expand G_FMAD a, b, c -> G_FADD (G_FMUL a, b), c
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(DstReg);
+ unsigned Flags = MI.getFlags();
+
+ auto Mul = MIRBuilder.buildFMul(Ty, MI.getOperand(1), MI.getOperand(2),
+ Flags);
+ MIRBuilder.buildFAdd(DstReg, Mul, MI.getOperand(3), Flags);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
+ const unsigned NumDst = MI.getNumOperands() - 1;
+ const Register SrcReg = MI.getOperand(NumDst).getReg();
+ LLT SrcTy = MRI.getType(SrcReg);
+
+ Register Dst0Reg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(Dst0Reg);
+
+ // Expand scalarizing unmerge as bitcast to integer and shift.
+ if (!DstTy.isVector() && SrcTy.isVector() &&
+ SrcTy.getElementType() == DstTy) {
+ LLT IntTy = LLT::scalar(SrcTy.getSizeInBits());
+ Register Cast = MIRBuilder.buildBitcast(IntTy, SrcReg).getReg(0);
+
+ MIRBuilder.buildTrunc(Dst0Reg, Cast);
+
+ const unsigned DstSize = DstTy.getSizeInBits();
+ unsigned Offset = DstSize;
+ for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
+ auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
+ auto Shift = MIRBuilder.buildLShr(IntTy, Cast, ShiftAmt);
+ MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ return UnableToLegalize;
+}
+
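
A scalar model of the unmerge expansion above for a <4 x s16> source bitcast to s64: each destination is recovered by a shift and a truncate (sketch only; names hypothetical):

    #include <cassert>
    #include <cstdint>

    static void unmerge4x16(uint64_t Cast, uint16_t Out[4]) {
      Out[0] = static_cast<uint16_t>(Cast); // plain trunc for the first piece
      for (unsigned I = 1; I != 4; ++I)
        Out[I] = static_cast<uint16_t>(Cast >> (16 * I)); // lshr + trunc
    }

    int main() {
      uint16_t Out[4];
      unmerge4x16(0x4444333322221111ULL, Out);
      assert(Out[0] == 0x1111 && Out[3] == 0x4444);
      return 0;
    }
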
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register Src0Reg = MI.getOperand(1).getReg();
+ Register Src1Reg = MI.getOperand(2).getReg();
+ LLT Src0Ty = MRI.getType(Src0Reg);
+ LLT DstTy = MRI.getType(DstReg);
+ LLT IdxTy = LLT::scalar(32);
+
+ const Constant *ShufMask = MI.getOperand(3).getShuffleMask();
+
+ SmallVector<int, 32> Mask;
+ ShuffleVectorInst::getShuffleMask(ShufMask, Mask);
+
+ if (DstTy.isScalar()) {
+ if (Src0Ty.isVector())
+ return UnableToLegalize;
+
+ // This is just a SELECT.
+ assert(Mask.size() == 1 && "Expected a single mask element");
+ Register Val;
+ if (Mask[0] < 0 || Mask[0] > 1)
+ Val = MIRBuilder.buildUndef(DstTy).getReg(0);
+ else
+ Val = Mask[0] == 0 ? Src0Reg : Src1Reg;
+ MIRBuilder.buildCopy(DstReg, Val);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ Register Undef;
+ SmallVector<Register, 32> BuildVec;
+ LLT EltTy = DstTy.getElementType();
+
+ for (int Idx : Mask) {
+ if (Idx < 0) {
+ if (!Undef.isValid())
+ Undef = MIRBuilder.buildUndef(EltTy).getReg(0);
+ BuildVec.push_back(Undef);
+ continue;
+ }
+
+ if (Src0Ty.isScalar()) {
+ BuildVec.push_back(Idx == 0 ? Src0Reg : Src1Reg);
+ } else {
+ int NumElts = Src0Ty.getNumElements();
+ Register SrcVec = Idx < NumElts ? Src0Reg : Src1Reg;
+ int ExtractIdx = Idx < NumElts ? Idx : Idx - NumElts;
+ auto IdxK = MIRBuilder.buildConstant(IdxTy, ExtractIdx);
+ auto Extract = MIRBuilder.buildExtractVectorElement(EltTy, SrcVec, IdxK);
+ BuildVec.push_back(Extract.getReg(0));
+ }
+ }
+
+ MIRBuilder.buildBuildVector(DstReg, BuildVec);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
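
A scalar model of the shuffle expansion above: each result element is extracted from one of the two sources according to the mask, and a negative index produces an undef slot, modeled here as 0 (sketch only; names hypothetical):

    #include <cassert>
    #include <vector>

    static std::vector<int> shuffle(const std::vector<int> &A,
                                    const std::vector<int> &B,
                                    const std::vector<int> &Mask) {
      std::vector<int> R;
      const int N = static_cast<int>(A.size());
      for (int Idx : Mask) {
        if (Idx < 0) {
          R.push_back(0); // undef slot (G_IMPLICIT_DEF element)
          continue;
        }
        R.push_back(Idx < N ? A[Idx] : B[Idx - N]); // extract-vector-element
      }
      return R; // the final G_BUILD_VECTOR
    }

    int main() {
      assert((shuffle({1, 2}, {3, 4}, {0, 3}) == std::vector<int>{1, 4}));
      return 0;
    }
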
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
+ Register Dst = MI.getOperand(0).getReg();
+ Register AllocSize = MI.getOperand(1).getReg();
+ unsigned Align = MI.getOperand(2).getImm();
+
+ const auto &MF = *MI.getMF();
+ const auto &TLI = *MF.getSubtarget().getTargetLowering();
+
+ LLT PtrTy = MRI.getType(Dst);
+ LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
+
+ Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
+ auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
+ SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);
+
+ // Subtract the final alloc from the SP. We use G_PTRTOINT here so we don't
+ // have to generate an extra instruction to negate the alloc and then use
+ // G_GEP to add the negative offset.
+ auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
+ if (Align) {
+ APInt AlignMask(IntPtrTy.getSizeInBits(), Align, true);
+ AlignMask.negate();
+ auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
+ Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
+ }
+
+ SPTmp = MIRBuilder.buildCast(PtrTy, Alloc);
+ MIRBuilder.buildCopy(SPReg, SPTmp);
+ MIRBuilder.buildCopy(Dst, SPTmp);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
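
A scalar model of the dynamic stack allocation lowering above, assuming a downward-growing stack: subtract the size from SP, then clear the low bits when a power-of-two alignment is requested (sketch only; names hypothetical):

    #include <cassert>
    #include <cstdint>

    static uint64_t dynAlloca(uint64_t SP, uint64_t Size, uint64_t Align) {
      uint64_t P = SP - Size; // G_SUB on the int-typed copy of SP
      if (Align)
        P &= ~(Align - 1);    // G_AND with the negated alignment mask
      return P;               // written back as both the new SP and the result
    }

    int main() {
      assert(dynAlloca(0x1000, 20, 16) == 0xFE0);
      return 0;
    }
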
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerExtract(MachineInstr &MI) {
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ unsigned Offset = MI.getOperand(2).getImm();
+
+ LLT DstTy = MRI.getType(Dst);
+ LLT SrcTy = MRI.getType(Src);
+
+ if (DstTy.isScalar() &&
+ (SrcTy.isScalar() ||
+ (SrcTy.isVector() && DstTy == SrcTy.getElementType()))) {
+ LLT SrcIntTy = SrcTy;
+ if (!SrcTy.isScalar()) {
+ SrcIntTy = LLT::scalar(SrcTy.getSizeInBits());
+ Src = MIRBuilder.buildBitcast(SrcIntTy, Src).getReg(0);
+ }
+
+ if (Offset == 0)
+ MIRBuilder.buildTrunc(Dst, Src);
+ else {
+ auto ShiftAmt = MIRBuilder.buildConstant(SrcIntTy, Offset);
+ auto Shr = MIRBuilder.buildLShr(SrcIntTy, Src, ShiftAmt);
+ MIRBuilder.buildTrunc(Dst, Shr);
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ return UnableToLegalize;
+}
+
+LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ Register InsertSrc = MI.getOperand(2).getReg();
+ uint64_t Offset = MI.getOperand(3).getImm();
+
+ LLT DstTy = MRI.getType(Src);
+ LLT InsertTy = MRI.getType(InsertSrc);
+
+ if (InsertTy.isScalar() &&
+ (DstTy.isScalar() ||
+ (DstTy.isVector() && DstTy.getElementType() == InsertTy))) {
+ LLT IntDstTy = DstTy;
+ if (!DstTy.isScalar()) {
+ IntDstTy = LLT::scalar(DstTy.getSizeInBits());
+ Src = MIRBuilder.buildBitcast(IntDstTy, Src).getReg(0);
+ }
+
+ Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0);
+ if (Offset != 0) {
+ auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset);
+ ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0);
+ }
+
+ APInt MaskVal = ~APInt::getBitsSet(DstTy.getSizeInBits(), Offset,
+ InsertTy.getSizeInBits());
+
+ auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal);
+ auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask);
+ auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc);
+
+ MIRBuilder.buildBitcast(Dst, Or);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ return UnableToLegalize;
+}
+
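
A scalar model of the insert lowering above: clear the destination bit range with a mask, then OR in the shifted value; the final AND with the mask stands in for the zero-extension of InsertSrc (sketch only; names hypothetical):

    #include <cassert>
    #include <cstdint>

    static uint64_t insertBits(uint64_t Dst, uint64_t Src, unsigned Offset,
                               unsigned Width) {
      uint64_t Field = Width < 64 ? (1ULL << Width) - 1 : ~0ULL;
      uint64_t Mask = Field << Offset;                 // bits being replaced
      return (Dst & ~Mask) | ((Src << Offset) & Mask); // and-not, then or
    }

    int main() {
      assert(insertBits(0xFFFFFFFFu, 0xAB, 8, 8) == 0xFFFFABFFu);
      return 0;
    }
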
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerSADDO_SSUBO(MachineInstr &MI) {
+ Register Dst0 = MI.getOperand(0).getReg();
+ Register Dst1 = MI.getOperand(1).getReg();
+ Register LHS = MI.getOperand(2).getReg();
+ Register RHS = MI.getOperand(3).getReg();
+ const bool IsAdd = MI.getOpcode() == TargetOpcode::G_SADDO;
+
+ LLT Ty = MRI.getType(Dst0);
+ LLT BoolTy = MRI.getType(Dst1);
+
+ if (IsAdd)
+ MIRBuilder.buildAdd(Dst0, LHS, RHS);
+ else
+ MIRBuilder.buildSub(Dst0, LHS, RHS);
+
+ // TODO: If SADDSAT/SSUBSAT is legal, compare results to detect overflow.
+
+ auto Zero = MIRBuilder.buildConstant(Ty, 0);
+
+ // For an addition, the result should be less than one of the operands (LHS)
+ // if and only if the other operand (RHS) is negative, otherwise there will
+ // be overflow.
+ // For a subtraction, the result should be less than one of the operands
+ // (LHS) if and only if the other operand (RHS) is (non-zero) positive,
+ // otherwise there will be overflow.
+ auto ResultLowerThanLHS =
+ MIRBuilder.buildICmp(CmpInst::ICMP_SLT, BoolTy, Dst0, LHS);
+ auto ConditionRHS = MIRBuilder.buildICmp(
+ IsAdd ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, BoolTy, RHS, Zero);
+
+ MIRBuilder.buildXor(Dst1, ConditionRHS, ResultLowerThanLHS);
+ MI.eraseFromParent();
+ return Legalized;
+}
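
A scalar model of the overflow test in lowerSADDO_SSUBO for the add case: signed overflow occurred exactly when (result < LHS) disagrees with (RHS < 0), which is what the final G_XOR computes (sketch only; names hypothetical):

    #include <cassert>
    #include <cstdint>

    static bool saddo(int64_t L, int64_t R, int64_t &Res) {
      Res = static_cast<int64_t>(static_cast<uint64_t>(L) +
                                 static_cast<uint64_t>(R)); // wrapping G_ADD
      return (Res < L) != (R < 0);                          // the final G_XOR
    }

    int main() {
      int64_t Res;
      assert(!saddo(1, 2, Res) && Res == 3);
      assert(saddo(INT64_MAX, 1, Res)); // wraps negative: overflow detected
      return 0;
    }
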
diff --git a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
index 6e1de95b3277..70045512fae5 100644
--- a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
@@ -215,7 +215,30 @@ bool LegalizeRuleSet::verifyTypeIdxsCoverage(unsigned NumTypeIdxs) const {
return true;
}
const bool AllCovered = (FirstUncovered >= NumTypeIdxs);
- LLVM_DEBUG(dbgs() << ".. the first uncovered type index: " << FirstUncovered
+ if (NumTypeIdxs > 0)
+ LLVM_DEBUG(dbgs() << ".. the first uncovered type index: " << FirstUncovered
+ << ", " << (AllCovered ? "OK" : "FAIL") << "\n");
+ return AllCovered;
+#else
+ return true;
+#endif
+}
+
+bool LegalizeRuleSet::verifyImmIdxsCoverage(unsigned NumImmIdxs) const {
+#ifndef NDEBUG
+ if (Rules.empty()) {
+ LLVM_DEBUG(
+ dbgs() << ".. imm index coverage check SKIPPED: no rules defined\n");
+ return true;
+ }
+ const int64_t FirstUncovered = ImmIdxsCovered.find_first_unset();
+ if (FirstUncovered < 0) {
+ LLVM_DEBUG(dbgs() << ".. imm index coverage check SKIPPED:"
+ " user-defined predicate detected\n");
+ return true;
+ }
+ const bool AllCovered = (FirstUncovered >= NumImmIdxs);
+ LLVM_DEBUG(dbgs() << ".. the first uncovered imm index: " << FirstUncovered
<< ", " << (AllCovered ? "OK" : "FAIL") << "\n");
return AllCovered;
#else
@@ -387,8 +410,6 @@ unsigned LegalizerInfo::getActionDefinitionsIdx(unsigned Opcode) const {
LLVM_DEBUG(dbgs() << ".. opcode " << Opcode << " is aliased to " << Alias
<< "\n");
OpcodeIdx = getOpcodeIdxForOpcode(Alias);
- LLVM_DEBUG(dbgs() << ".. opcode " << Alias << " is aliased to "
- << RulesForOpcode[OpcodeIdx].getAlias() << "\n");
assert(RulesForOpcode[OpcodeIdx].getAlias() == 0 && "Cannot chain aliases");
}
@@ -412,7 +433,7 @@ LegalizeRuleSet &LegalizerInfo::getActionDefinitionsBuilder(
std::initializer_list<unsigned> Opcodes) {
unsigned Representative = *Opcodes.begin();
- assert(!empty(Opcodes) && Opcodes.begin() + 1 != Opcodes.end() &&
+ assert(!llvm::empty(Opcodes) && Opcodes.begin() + 1 != Opcodes.end() &&
"Initializer list must have at least two opcodes");
for (auto I = Opcodes.begin() + 1, E = Opcodes.end(); I != E; ++I)
@@ -677,12 +698,23 @@ void LegalizerInfo::verify(const MCInstrInfo &MII) const {
? std::max(OpInfo.getGenericTypeIndex() + 1U, Acc)
: Acc;
});
+ const unsigned NumImmIdxs = std::accumulate(
+ MCID.opInfo_begin(), MCID.opInfo_end(), 0U,
+ [](unsigned Acc, const MCOperandInfo &OpInfo) {
+ return OpInfo.isGenericImm()
+ ? std::max(OpInfo.getGenericImmIndex() + 1U, Acc)
+ : Acc;
+ });
LLVM_DEBUG(dbgs() << MII.getName(Opcode) << " (opcode " << Opcode
<< "): " << NumTypeIdxs << " type ind"
- << (NumTypeIdxs == 1 ? "ex" : "ices") << "\n");
+ << (NumTypeIdxs == 1 ? "ex" : "ices") << ", "
+ << NumImmIdxs << " imm ind"
+ << (NumImmIdxs == 1 ? "ex" : "ices") << "\n");
const LegalizeRuleSet &RuleSet = getActionDefinitions(Opcode);
if (!RuleSet.verifyTypeIdxsCoverage(NumTypeIdxs))
FailedOpcodes.push_back(Opcode);
+ else if (!RuleSet.verifyImmIdxsCoverage(NumImmIdxs))
+ FailedOpcodes.push_back(Opcode);
}
if (!FailedOpcodes.empty()) {
errs() << "The following opcodes have ill-defined legalization rules:";
diff --git a/lib/CodeGen/GlobalISel/Localizer.cpp b/lib/CodeGen/GlobalISel/Localizer.cpp
index 3592409710a7..f882ecbf5db3 100644
--- a/lib/CodeGen/GlobalISel/Localizer.cpp
+++ b/lib/CodeGen/GlobalISel/Localizer.cpp
@@ -79,7 +79,7 @@ bool Localizer::shouldLocalize(const MachineInstr &MI) {
return true;
case TargetOpcode::G_GLOBAL_VALUE: {
unsigned RematCost = TTI->getGISelRematGlobalCost();
- unsigned Reg = MI.getOperand(0).getReg();
+ Register Reg = MI.getOperand(0).getReg();
unsigned MaxUses = maxUses(RematCost);
if (MaxUses == UINT_MAX)
return true; // Remats are "free" so always localize.
@@ -121,7 +121,7 @@ bool Localizer::localizeInterBlock(MachineFunction &MF,
LLVM_DEBUG(dbgs() << "Should localize: " << MI);
assert(MI.getDesc().getNumDefs() == 1 &&
"More than one definition not supported yet");
- unsigned Reg = MI.getOperand(0).getReg();
+ Register Reg = MI.getOperand(0).getReg();
// Check if all the users of MI are local.
// We are going to invalidate the list of use operands, so we
// can't use a range iterator here.
@@ -151,7 +151,7 @@ bool Localizer::localizeInterBlock(MachineFunction &MF,
LocalizedMI);
// Set a new register for the definition.
- unsigned NewReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
+ Register NewReg = MRI->createGenericVirtualRegister(MRI->getType(Reg));
MRI->setRegClassOrRegBank(NewReg, MRI->getRegClassOrRegBank(Reg));
LocalizedMI->getOperand(0).setReg(NewReg);
NewVRegIt =
@@ -177,7 +177,7 @@ bool Localizer::localizeIntraBlock(LocalizedSetVecT &LocalizedInstrs) {
// many users, but this case may be better served by regalloc improvements.
for (MachineInstr *MI : LocalizedInstrs) {
- unsigned Reg = MI->getOperand(0).getReg();
+ Register Reg = MI->getOperand(0).getReg();
MachineBasicBlock &MBB = *MI->getParent();
// All of the user MIs of this reg.
SmallPtrSet<MachineInstr *, 32> Users;
@@ -220,5 +220,6 @@ bool Localizer::runOnMachineFunction(MachineFunction &MF) {
LocalizedSetVecT LocalizedInstrs;
bool Changed = localizeInterBlock(MF, LocalizedInstrs);
- return Changed |= localizeIntraBlock(LocalizedInstrs);
+ Changed |= localizeIntraBlock(LocalizedInstrs);
+ return Changed;
}
diff --git a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index b7a73326b85c..df770f6664ca 100644
--- a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -107,9 +107,13 @@ MachineIRBuilder::buildIndirectDbgValue(Register Reg, const MDNode *Variable,
assert(
cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(getDL()) &&
"Expected inlined-at fields to agree");
+ // DBG_VALUE insts now carry IR-level indirection in their DIExpression
+ // rather than encoding it in the instruction itself.
+ const DIExpression *DIExpr = cast<DIExpression>(Expr);
+ DIExpr = DIExpression::append(DIExpr, {dwarf::DW_OP_deref});
return insertInstr(BuildMI(getMF(), getDL(),
getTII().get(TargetOpcode::DBG_VALUE),
- /*IsIndirect*/ true, Reg, Variable, Expr));
+ /*IsIndirect*/ false, Reg, Variable, DIExpr));
}
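// Editor's note (not part of this patch): roughly, this changes the emitted
// MIR from an indirect DBG_VALUE to a direct one with a DW_OP_deref folded
// into the expression, e.g. (variable and register names hypothetical):
//   before: DBG_VALUE $rdi, 0, !"x", !DIExpression()
//   after:  DBG_VALUE $rdi, $noreg, !"x", !DIExpression(DW_OP_deref)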
MachineInstrBuilder MachineIRBuilder::buildFIDbgValue(int FI,
@@ -120,11 +124,15 @@ MachineInstrBuilder MachineIRBuilder::buildFIDbgValue(int FI,
assert(
cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(getDL()) &&
"Expected inlined-at fields to agree");
+ // DBG_VALUE insts now carry IR-level indirection in their DIExpression
+ // rather than encoding it in the instruction itself.
+ const DIExpression *DIExpr = cast<DIExpression>(Expr);
+ DIExpr = DIExpression::append(DIExpr, {dwarf::DW_OP_deref});
return buildInstr(TargetOpcode::DBG_VALUE)
.addFrameIndex(FI)
- .addImm(0)
+ .addReg(0)
.addMetadata(Variable)
- .addMetadata(Expr);
+ .addMetadata(DIExpr);
}
MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C,
@@ -148,7 +156,7 @@ MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C,
MIB.addReg(0U);
}
- return MIB.addImm(0).addMetadata(Variable).addMetadata(Expr);
+ return MIB.addReg(0).addMetadata(Variable).addMetadata(Expr);
}
MachineInstrBuilder MachineIRBuilder::buildDbgLabel(const MDNode *Label) {
@@ -160,6 +168,17 @@ MachineInstrBuilder MachineIRBuilder::buildDbgLabel(const MDNode *Label) {
return MIB.addMetadata(Label);
}
+MachineInstrBuilder MachineIRBuilder::buildDynStackAlloc(const DstOp &Res,
+ const SrcOp &Size,
+ unsigned Align) {
+ assert(Res.getLLTTy(*getMRI()).isPointer() && "expected ptr dst type");
+ auto MIB = buildInstr(TargetOpcode::G_DYN_STACKALLOC);
+ Res.addDefToMIB(*getMRI(), MIB);
+ Size.addSrcToMIB(MIB);
+ MIB.addImm(Align);
+ return MIB;
+}
+
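// Usage sketch for the new helper (assumed builder state and types, not part
// of this patch): allocate 128 bytes on the stack with 16-byte alignment.
LLT S64 = LLT::scalar(64);
LLT P0 = LLT::pointer(0, 64);
auto Size = MIRBuilder.buildConstant(S64, 128);
auto Ptr = MIRBuilder.buildDynStackAlloc(P0, Size, /*Align=*/16);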
MachineInstrBuilder MachineIRBuilder::buildFrameIndex(const DstOp &Res,
int Idx) {
assert(Res.getLLTTy(*getMRI()).isPointer() && "invalid operand type");
@@ -207,11 +226,7 @@ MachineInstrBuilder MachineIRBuilder::buildGEP(const DstOp &Res,
Res.getLLTTy(*getMRI()) == Op0.getLLTTy(*getMRI()) && "type mismatch");
assert(Op1.getLLTTy(*getMRI()).isScalar() && "invalid offset type");
- auto MIB = buildInstr(TargetOpcode::G_GEP);
- Res.addDefToMIB(*getMRI(), MIB);
- Op0.addSrcToMIB(MIB);
- Op1.addSrcToMIB(MIB);
- return MIB;
+ return buildInstr(TargetOpcode::G_GEP, {Res}, {Op0, Op1});
}
Optional<MachineInstrBuilder>
@@ -697,17 +712,19 @@ MachineInstrBuilder MachineIRBuilder::buildICmp(CmpInst::Predicate Pred,
MachineInstrBuilder MachineIRBuilder::buildFCmp(CmpInst::Predicate Pred,
const DstOp &Res,
const SrcOp &Op0,
- const SrcOp &Op1) {
+ const SrcOp &Op1,
+ Optional<unsigned> Flags) {
- return buildInstr(TargetOpcode::G_FCMP, Res, {Pred, Op0, Op1});
+ return buildInstr(TargetOpcode::G_FCMP, Res, {Pred, Op0, Op1}, Flags);
}
MachineInstrBuilder MachineIRBuilder::buildSelect(const DstOp &Res,
const SrcOp &Tst,
const SrcOp &Op0,
- const SrcOp &Op1) {
+ const SrcOp &Op1,
+ Optional<unsigned> Flags) {
- return buildInstr(TargetOpcode::G_SELECT, {Res}, {Tst, Op0, Op1});
+ return buildInstr(TargetOpcode::G_SELECT, {Res}, {Tst, Op0, Op1}, Flags);
}
MachineInstrBuilder
@@ -774,26 +791,28 @@ MachineIRBuilder::buildAtomicCmpXchg(Register OldValRes, Register Addr,
.addMemOperand(&MMO);
}
-MachineInstrBuilder MachineIRBuilder::buildAtomicRMW(unsigned Opcode,
- Register OldValRes,
- Register Addr,
- Register Val,
- MachineMemOperand &MMO) {
+MachineInstrBuilder MachineIRBuilder::buildAtomicRMW(
+ unsigned Opcode, const DstOp &OldValRes,
+ const SrcOp &Addr, const SrcOp &Val,
+ MachineMemOperand &MMO) {
+
#ifndef NDEBUG
- LLT OldValResTy = getMRI()->getType(OldValRes);
- LLT AddrTy = getMRI()->getType(Addr);
- LLT ValTy = getMRI()->getType(Val);
+ LLT OldValResTy = OldValRes.getLLTTy(*getMRI());
+ LLT AddrTy = Addr.getLLTTy(*getMRI());
+ LLT ValTy = Val.getLLTTy(*getMRI());
assert(OldValResTy.isScalar() && "invalid operand type");
assert(AddrTy.isPointer() && "invalid operand type");
assert(ValTy.isValid() && "invalid operand type");
assert(OldValResTy == ValTy && "type mismatch");
+ assert(MMO.isAtomic() && "not atomic mem operand");
#endif
- return buildInstr(Opcode)
- .addDef(OldValRes)
- .addUse(Addr)
- .addUse(Val)
- .addMemOperand(&MMO);
+ auto MIB = buildInstr(Opcode);
+ OldValRes.addDefToMIB(*getMRI(), MIB);
+ Addr.addSrcToMIB(MIB);
+ Val.addSrcToMIB(MIB);
+ MIB.addMemOperand(&MMO);
+ return MIB;
}
MachineInstrBuilder
@@ -865,6 +884,21 @@ MachineIRBuilder::buildAtomicRMWUmin(Register OldValRes, Register Addr,
}
MachineInstrBuilder
+MachineIRBuilder::buildAtomicRMWFAdd(
+ const DstOp &OldValRes, const SrcOp &Addr, const SrcOp &Val,
+ MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_FADD, OldValRes, Addr, Val,
+ MMO);
+}
+
+MachineInstrBuilder
+MachineIRBuilder::buildAtomicRMWFSub(const DstOp &OldValRes, const SrcOp &Addr, const SrcOp &Val,
+ MachineMemOperand &MMO) {
+ return buildAtomicRMW(TargetOpcode::G_ATOMICRMW_FSUB, OldValRes, Addr, Val,
+ MMO);
+}
+
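// Usage sketch (AddrReg, ValReg and MachineMemOperand MMO are assumed to
// exist; not part of this patch): atomically add a 32-bit value at AddrReg,
// yielding the old value as a new virtual register.
auto OldVal =
    MIRBuilder.buildAtomicRMWFAdd(LLT::scalar(32), AddrReg, ValReg, MMO);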
+MachineInstrBuilder
MachineIRBuilder::buildFence(unsigned Ordering, unsigned Scope) {
return buildInstr(TargetOpcode::G_FENCE)
.addImm(Ordering)
@@ -1037,8 +1071,11 @@ MachineInstrBuilder MachineIRBuilder::buildInstr(unsigned Opc,
"input operands do not cover output register");
if (SrcOps.size() == 1)
return buildCast(DstOps[0], SrcOps[0]);
- if (DstOps[0].getLLTTy(*getMRI()).isVector())
- return buildInstr(TargetOpcode::G_CONCAT_VECTORS, DstOps, SrcOps);
+ if (DstOps[0].getLLTTy(*getMRI()).isVector()) {
+ if (SrcOps[0].getLLTTy(*getMRI()).isVector())
+ return buildInstr(TargetOpcode::G_CONCAT_VECTORS, DstOps, SrcOps);
+ return buildInstr(TargetOpcode::G_BUILD_VECTOR, DstOps, SrcOps);
+ }
break;
}
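// Editor's sketch of the effect (registers A..D, Lo, Hi hypothetical; not
// part of this patch): buildMerge now selects the right opcode for vector
// destinations based on the source types.
LLT V4S32 = LLT::vector(4, 32);
auto BV = MIRBuilder.buildMerge(V4S32, {A, B, C, D}); // scalars -> G_BUILD_VECTOR
auto CV = MIRBuilder.buildMerge(V4S32, {Lo, Hi});     // <2 x s32> -> G_CONCAT_VECTORS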
case TargetOpcode::G_EXTRACT_VECTOR_ELT: {
diff --git a/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/lib/CodeGen/GlobalISel/RegBankSelect.cpp
index 42be88fcf947..f0e35c65c53b 100644
--- a/lib/CodeGen/GlobalISel/RegBankSelect.cpp
+++ b/lib/CodeGen/GlobalISel/RegBankSelect.cpp
@@ -92,7 +92,7 @@ void RegBankSelect::init(MachineFunction &MF) {
MBPI = nullptr;
}
MIRBuilder.setMF(MF);
- MORE = llvm::make_unique<MachineOptimizationRemarkEmitter>(MF, MBFI);
+ MORE = std::make_unique<MachineOptimizationRemarkEmitter>(MF, MBFI);
}
void RegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const {
@@ -139,7 +139,7 @@ bool RegBankSelect::repairReg(
"need new vreg for each breakdown");
// An empty range of new register means no repairing.
- assert(!empty(NewVRegs) && "We should not have to repair");
+ assert(!NewVRegs.empty() && "We should not have to repair");
MachineInstr *MI;
if (ValMapping.NumBreakDowns == 1) {
@@ -154,7 +154,7 @@ bool RegBankSelect::repairReg(
std::swap(Src, Dst);
assert((RepairPt.getNumInsertPoints() == 1 ||
- TargetRegisterInfo::isPhysicalRegister(Dst)) &&
+ Register::isPhysicalRegister(Dst)) &&
"We are about to create several defs for Dst");
// Build the instruction used to repair, then clone it at the right
@@ -398,7 +398,7 @@ void RegBankSelect::tryAvoidingSplit(
// Check if this is a physical or virtual register.
Register Reg = MO.getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (Register::isPhysicalRegister(Reg)) {
// We are going to split every outgoing edges.
// Check that this is possible.
// FIXME: The machine representation is currently broken
@@ -687,8 +687,9 @@ bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) {
// iterator before hand.
MachineInstr &MI = *MII++;
- // Ignore target-specific instructions: they should use proper regclasses.
- if (isTargetSpecificOpcode(MI.getOpcode()))
+ // Ignore target-specific post-isel instructions: they should use proper
+ // regclasses.
+ if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode())
continue;
if (!assignInstr(MI)) {
diff --git a/lib/CodeGen/GlobalISel/RegisterBank.cpp b/lib/CodeGen/GlobalISel/RegisterBank.cpp
index 4e41f338934d..fc9c802693ab 100644
--- a/lib/CodeGen/GlobalISel/RegisterBank.cpp
+++ b/lib/CodeGen/GlobalISel/RegisterBank.cpp
@@ -12,6 +12,7 @@
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Debug.h"
#define DEBUG_TYPE "registerbank"
diff --git a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
index 159422e38878..3fcc55286beb 100644
--- a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
+++ b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
@@ -82,7 +82,7 @@ bool RegisterBankInfo::verify(const TargetRegisterInfo &TRI) const {
const RegisterBank *
RegisterBankInfo::getRegBank(Register Reg, const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI) const {
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
return &getRegBankFromRegClass(getMinimalPhysRegClass(Reg, TRI));
assert(Reg && "NoRegister does not have a register bank");
@@ -97,8 +97,7 @@ RegisterBankInfo::getRegBank(Register Reg, const MachineRegisterInfo &MRI,
const TargetRegisterClass &
RegisterBankInfo::getMinimalPhysRegClass(Register Reg,
const TargetRegisterInfo &TRI) const {
- assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
- "Reg must be a physreg");
+ assert(Register::isPhysicalRegister(Reg) && "Reg must be a physreg");
const auto &RegRCIt = PhysRegMinimalRCs.find(Reg);
if (RegRCIt != PhysRegMinimalRCs.end())
return *RegRCIt->second;
@@ -284,7 +283,7 @@ RegisterBankInfo::getPartialMapping(unsigned StartIdx, unsigned Length,
++NumPartialMappingsCreated;
auto &PartMapping = MapOfPartialMappings[Hash];
- PartMapping = llvm::make_unique<PartialMapping>(StartIdx, Length, RegBank);
+ PartMapping = std::make_unique<PartialMapping>(StartIdx, Length, RegBank);
return *PartMapping;
}
@@ -318,7 +317,7 @@ RegisterBankInfo::getValueMapping(const PartialMapping *BreakDown,
++NumValueMappingsCreated;
auto &ValMapping = MapOfValueMappings[Hash];
- ValMapping = llvm::make_unique<ValueMapping>(BreakDown, NumBreakDowns);
+ ValMapping = std::make_unique<ValueMapping>(BreakDown, NumBreakDowns);
return *ValMapping;
}
@@ -342,7 +341,7 @@ RegisterBankInfo::getOperandsMapping(Iterator Begin, Iterator End) const {
// mapping, because we use the pointer of the ValueMapping
// to hash and we expect them to uniquely identify an instance
// of value mapping.
- Res = llvm::make_unique<ValueMapping[]>(std::distance(Begin, End));
+ Res = std::make_unique<ValueMapping[]>(std::distance(Begin, End));
unsigned Idx = 0;
for (Iterator It = Begin; It != End; ++It, ++Idx) {
const ValueMapping *ValMap = *It;
@@ -392,7 +391,7 @@ RegisterBankInfo::getInstructionMappingImpl(
++NumInstructionMappingsCreated;
auto &InstrMapping = MapOfInstructionMappings[Hash];
- InstrMapping = llvm::make_unique<InstructionMapping>(
+ InstrMapping = std::make_unique<InstructionMapping>(
ID, Cost, OperandsMapping, NumOperands);
return *InstrMapping;
}
@@ -456,7 +455,7 @@ void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) {
"This mapping is too complex for this function");
iterator_range<SmallVectorImpl<Register>::const_iterator> NewRegs =
OpdMapper.getVRegs(OpIdx);
- if (empty(NewRegs)) {
+ if (NewRegs.empty()) {
LLVM_DEBUG(dbgs() << " has not been repaired, nothing to be done\n");
continue;
}
@@ -489,7 +488,7 @@ void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) {
unsigned RegisterBankInfo::getSizeInBits(Register Reg,
const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI) const {
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (Register::isPhysicalRegister(Reg)) {
// The size is not directly available for physical registers.
// Instead, we need to access a register class that contains Reg and
// get the size of that register class.
diff --git a/lib/CodeGen/GlobalISel/Utils.cpp b/lib/CodeGen/GlobalISel/Utils.cpp
index 766ea1d60bac..45618d7992ad 100644
--- a/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/lib/CodeGen/GlobalISel/Utils.cpp
@@ -43,10 +43,9 @@ unsigned llvm::constrainOperandRegClass(
const RegisterBankInfo &RBI, MachineInstr &InsertPt,
const TargetRegisterClass &RegClass, const MachineOperand &RegMO,
unsigned OpIdx) {
- unsigned Reg = RegMO.getReg();
+ Register Reg = RegMO.getReg();
// Assume physical registers are properly constrained.
- assert(TargetRegisterInfo::isVirtualRegister(Reg) &&
- "PhysReg not implemented");
+ assert(Register::isVirtualRegister(Reg) && "PhysReg not implemented");
unsigned ConstrainedReg = constrainRegToClass(MRI, TII, RBI, Reg, RegClass);
// If we created a new virtual register because the class is not compatible
@@ -73,10 +72,9 @@ unsigned llvm::constrainOperandRegClass(
MachineRegisterInfo &MRI, const TargetInstrInfo &TII,
const RegisterBankInfo &RBI, MachineInstr &InsertPt, const MCInstrDesc &II,
const MachineOperand &RegMO, unsigned OpIdx) {
- unsigned Reg = RegMO.getReg();
+ Register Reg = RegMO.getReg();
// Assume physical registers are properly constrained.
- assert(TargetRegisterInfo::isVirtualRegister(Reg) &&
- "PhysReg not implemented");
+ assert(Register::isVirtualRegister(Reg) && "PhysReg not implemented");
const TargetRegisterClass *RegClass = TII.getRegClass(II, OpIdx, &TRI, MF);
// Some of the target independent instructions, like COPY, may not impose any
@@ -130,9 +128,9 @@ bool llvm::constrainSelectedInstRegOperands(MachineInstr &I,
LLVM_DEBUG(dbgs() << "Converting operand: " << MO << '\n');
assert(MO.isReg() && "Unsupported non-reg operand");
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
// Physical registers don't need to be constrained.
- if (TRI.isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
continue;
// Register operands with a value of 0 (e.g. predicate operands) don't need
@@ -170,9 +168,8 @@ bool llvm::isTriviallyDead(const MachineInstr &MI,
if (!MO.isReg() || !MO.isDef())
continue;
- unsigned Reg = MO.getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
- !MRI.use_nodbg_empty(Reg))
+ Register Reg = MO.getReg();
+ if (Register::isPhysicalRegister(Reg) || !MRI.use_nodbg_empty(Reg))
return false;
}
return true;
@@ -219,11 +216,33 @@ Optional<int64_t> llvm::getConstantVRegVal(unsigned VReg,
}
Optional<ValueAndVReg> llvm::getConstantVRegValWithLookThrough(
- unsigned VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs) {
+ unsigned VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs,
+ bool HandleFConstant) {
SmallVector<std::pair<unsigned, unsigned>, 4> SeenOpcodes;
MachineInstr *MI;
- while ((MI = MRI.getVRegDef(VReg)) &&
- MI->getOpcode() != TargetOpcode::G_CONSTANT && LookThroughInstrs) {
+ auto IsConstantOpcode = [HandleFConstant](unsigned Opcode) {
+ return Opcode == TargetOpcode::G_CONSTANT ||
+ (HandleFConstant && Opcode == TargetOpcode::G_FCONSTANT);
+ };
+ auto GetImmediateValue = [HandleFConstant,
+ &MRI](const MachineInstr &MI) -> Optional<APInt> {
+ const MachineOperand &CstVal = MI.getOperand(1);
+ if (!CstVal.isImm() && !CstVal.isCImm() &&
+ (!HandleFConstant || !CstVal.isFPImm()))
+ return None;
+ if (!CstVal.isFPImm()) {
+ unsigned BitWidth =
+ MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ APInt Val = CstVal.isImm() ? APInt(BitWidth, CstVal.getImm())
+ : CstVal.getCImm()->getValue();
+ assert(Val.getBitWidth() == BitWidth &&
+ "Value bitwidth doesn't match definition type");
+ return Val;
+ }
+ return CstVal.getFPImm()->getValueAPF().bitcastToAPInt();
+ };
+ while ((MI = MRI.getVRegDef(VReg)) && !IsConstantOpcode(MI->getOpcode()) &&
+ LookThroughInstrs) {
switch (MI->getOpcode()) {
case TargetOpcode::G_TRUNC:
case TargetOpcode::G_SEXT:
@@ -235,7 +254,7 @@ Optional<ValueAndVReg> llvm::getConstantVRegValWithLookThrough(
break;
case TargetOpcode::COPY:
VReg = MI->getOperand(1).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(VReg))
+ if (Register::isPhysicalRegister(VReg))
return None;
break;
case TargetOpcode::G_INTTOPTR:
@@ -245,16 +264,13 @@ Optional<ValueAndVReg> llvm::getConstantVRegValWithLookThrough(
return None;
}
}
- if (!MI || MI->getOpcode() != TargetOpcode::G_CONSTANT ||
- (!MI->getOperand(1).isImm() && !MI->getOperand(1).isCImm()))
+ if (!MI || !IsConstantOpcode(MI->getOpcode()))
return None;
- const MachineOperand &CstVal = MI->getOperand(1);
- unsigned BitWidth = MRI.getType(MI->getOperand(0).getReg()).getSizeInBits();
- APInt Val = CstVal.isImm() ? APInt(BitWidth, CstVal.getImm())
- : CstVal.getCImm()->getValue();
- assert(Val.getBitWidth() == BitWidth &&
- "Value bitwidth doesn't match definition type");
+ Optional<APInt> MaybeVal = GetImmediateValue(*MI);
+ if (!MaybeVal)
+ return None;
+ APInt &Val = *MaybeVal;
while (!SeenOpcodes.empty()) {
std::pair<unsigned, unsigned> OpcodeAndSize = SeenOpcodes.pop_back_val();
switch (OpcodeAndSize.first) {
@@ -291,7 +307,7 @@ llvm::MachineInstr *llvm::getDefIgnoringCopies(Register Reg,
if (!DstTy.isValid())
return nullptr;
while (DefMI->getOpcode() == TargetOpcode::COPY) {
- unsigned SrcReg = DefMI->getOperand(1).getReg();
+ Register SrcReg = DefMI->getOperand(1).getReg();
auto SrcTy = MRI.getType(SrcReg);
if (!SrcTy.isValid() || SrcTy != DstTy)
break;
@@ -395,6 +411,40 @@ bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI,
return false;
}
+Optional<APInt> llvm::ConstantFoldExtOp(unsigned Opcode, const unsigned Op1,
+ uint64_t Imm,
+ const MachineRegisterInfo &MRI) {
+ auto MaybeOp1Cst = getConstantVRegVal(Op1, MRI);
+ if (MaybeOp1Cst) {
+ LLT Ty = MRI.getType(Op1);
+ APInt C1(Ty.getSizeInBits(), *MaybeOp1Cst, true);
+ switch (Opcode) {
+ default:
+ break;
+ case TargetOpcode::G_SEXT_INREG:
+ return C1.trunc(Imm).sext(C1.getBitWidth());
+ }
+ }
+ return None;
+}
+
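// Worked example of the G_SEXT_INREG fold above (values hypothetical, not
// part of this patch): folding %r:s32 = G_SEXT_INREG %c, 8 where
// %c = G_CONSTANT i32 255.
APInt C1(32, 255);
APInt Folded = C1.trunc(8).sext(32); // 0xFF -> s8 -1 -> s32 0xFFFFFFFF (-1)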
void llvm::getSelectionDAGFallbackAnalysisUsage(AnalysisUsage &AU) {
AU.addPreserved<StackProtector>();
}
+
+MVT llvm::getMVTForLLT(LLT Ty) {
+ if (!Ty.isVector())
+ return MVT::getIntegerVT(Ty.getSizeInBits());
+
+ return MVT::getVectorVT(
+ MVT::getIntegerVT(Ty.getElementType().getSizeInBits()),
+ Ty.getNumElements());
+}
+
+LLT llvm::getLLTForMVT(MVT Ty) {
+ if (!Ty.isVector())
+ return LLT::scalar(Ty.getSizeInBits());
+
+ return LLT::vector(Ty.getVectorNumElements(),
+ Ty.getVectorElementType().getSizeInBits());
+}
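// Round-trip sketch for the new helpers (not part of this patch). Note that
// only sizes and element counts survive: pointer and float types both map to
// integer types.
MVT VT = getMVTForLLT(LLT::vector(4, 32)); // MVT::v4i32
LLT Ty = getLLTForMVT(MVT::v4i32);         // <4 x s32>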
diff --git a/lib/CodeGen/GlobalMerge.cpp b/lib/CodeGen/GlobalMerge.cpp
index 09201c2e7bae..d4fa45fcb405 100644
--- a/lib/CodeGen/GlobalMerge.cpp
+++ b/lib/CodeGen/GlobalMerge.cpp
@@ -456,14 +456,14 @@ bool GlobalMerge::doMerge(const SmallVectorImpl<GlobalVariable *> &Globals,
bool HasExternal = false;
StringRef FirstExternalName;
- unsigned MaxAlign = 1;
+ Align MaxAlign;
unsigned CurIdx = 0;
for (j = i; j != -1; j = GlobalSet.find_next(j)) {
Type *Ty = Globals[j]->getValueType();
// Make sure we use the same alignment AsmPrinter would use.
- unsigned Align = DL.getPreferredAlignment(Globals[j]);
- unsigned Padding = alignTo(MergedSize, Align) - MergedSize;
+ Align Alignment(DL.getPreferredAlignment(Globals[j]));
+ unsigned Padding = alignTo(MergedSize, Alignment) - MergedSize;
MergedSize += Padding;
MergedSize += DL.getTypeAllocSize(Ty);
if (MergedSize > MaxOffset) {
@@ -478,7 +478,7 @@ bool GlobalMerge::doMerge(const SmallVectorImpl<GlobalVariable *> &Globals,
Inits.push_back(Globals[j]->getInitializer());
StructIdxs.push_back(CurIdx++);
- MaxAlign = std::max(MaxAlign, Align);
+ MaxAlign = std::max(MaxAlign, Alignment);
if (Globals[j]->hasExternalLinkage() && !HasExternal) {
HasExternal = true;
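// Editor's sketch of the Align semantics adopted above (not part of this
// patch): Align is a non-zero power of two, default-constructed to 1.
Align A(16);
uint64_t Padded = alignTo(13, A);          // 16
Align Max = std::max(Align(4), Align(16)); // Align(16)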
diff --git a/lib/CodeGen/HardwareLoops.cpp b/lib/CodeGen/HardwareLoops.cpp
index 5f57cabbe865..6a0f98d2e2b4 100644
--- a/lib/CodeGen/HardwareLoops.cpp
+++ b/lib/CodeGen/HardwareLoops.cpp
@@ -183,7 +183,7 @@ bool HardwareLoops::runOnFunction(Function &F) {
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
DL = &F.getParent()->getDataLayout();
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- LibInfo = TLIP ? &TLIP->getTLI() : nullptr;
+ LibInfo = TLIP ? &TLIP->getTLI(F) : nullptr;
PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
M = F.getParent();
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index b17a253fe23f..d9caa5660695 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -285,14 +285,113 @@ namespace {
Prediction);
}
- bool MeetIfcvtSizeLimit(MachineBasicBlock &TBB,
- unsigned TCycle, unsigned TExtra,
- MachineBasicBlock &FBB,
- unsigned FCycle, unsigned FExtra,
- BranchProbability Prediction) const {
- return TCycle > 0 && FCycle > 0 &&
- TII->isProfitableToIfCvt(TBB, TCycle, TExtra, FBB, FCycle, FExtra,
- Prediction);
+ bool MeetIfcvtSizeLimit(BBInfo &TBBInfo, BBInfo &FBBInfo,
+ MachineBasicBlock &CommBB, unsigned Dups,
+ BranchProbability Prediction, bool Forked) const {
+ const MachineFunction &MF = *TBBInfo.BB->getParent();
+ if (MF.getFunction().hasMinSize()) {
+ MachineBasicBlock::iterator TIB = TBBInfo.BB->begin();
+ MachineBasicBlock::iterator FIB = FBBInfo.BB->begin();
+ MachineBasicBlock::iterator TIE = TBBInfo.BB->end();
+ MachineBasicBlock::iterator FIE = FBBInfo.BB->end();
+
+ unsigned Dups1, Dups2;
+ if (!CountDuplicatedInstructions(TIB, FIB, TIE, FIE, Dups1, Dups2,
+ *TBBInfo.BB, *FBBInfo.BB,
+ /*SkipUnconditionalBranches*/ true))
+ llvm_unreachable("should already have been checked by ValidDiamond");
+
+ unsigned BranchBytes = 0;
+ unsigned CommonBytes = 0;
+
+ // Count common instructions at the start of the true and false blocks.
+ for (auto &I : make_range(TBBInfo.BB->begin(), TIB)) {
+ LLVM_DEBUG(dbgs() << "Common inst: " << I);
+ CommonBytes += TII->getInstSizeInBytes(I);
+ }
+ for (auto &I : make_range(FBBInfo.BB->begin(), FIB)) {
+ LLVM_DEBUG(dbgs() << "Common inst: " << I);
+ CommonBytes += TII->getInstSizeInBytes(I);
+ }
+
+ // Count instructions at the end of the true and false blocks, after
+ // the ones we plan to predicate. Analyzable branches will be removed
+ // (unless this is a forked diamond), and all other instructions are
+ // common between the two blocks.
+ for (auto &I : make_range(TIE, TBBInfo.BB->end())) {
+ if (I.isBranch() && TBBInfo.IsBrAnalyzable && !Forked) {
+ LLVM_DEBUG(dbgs() << "Saving branch: " << I);
+ BranchBytes += TII->predictBranchSizeForIfCvt(I);
+ } else {
+ LLVM_DEBUG(dbgs() << "Common inst: " << I);
+ CommonBytes += TII->getInstSizeInBytes(I);
+ }
+ }
+ for (auto &I : make_range(FIE, FBBInfo.BB->end())) {
+ if (I.isBranch() && FBBInfo.IsBrAnalyzable && !Forked) {
+ LLVM_DEBUG(dbgs() << "Saving branch: " << I);
+ BranchBytes += TII->predictBranchSizeForIfCvt(I);
+ } else {
+ LLVM_DEBUG(dbgs() << "Common inst: " << I);
+ CommonBytes += TII->getInstSizeInBytes(I);
+ }
+ }
+ for (auto &I : CommBB.terminators()) {
+ if (I.isBranch()) {
+ LLVM_DEBUG(dbgs() << "Saving branch: " << I);
+ BranchBytes += TII->predictBranchSizeForIfCvt(I);
+ }
+ }
+
+ // The common instructions in one branch will be eliminated, halving
+ // their code size.
+ CommonBytes /= 2;
+
+ // Count the instructions which we need to predicate.
+ unsigned NumPredicatedInstructions = 0;
+ for (auto &I : make_range(TIB, TIE)) {
+ if (!I.isDebugInstr()) {
+ LLVM_DEBUG(dbgs() << "Predicating: " << I);
+ NumPredicatedInstructions++;
+ }
+ }
+ for (auto &I : make_range(FIB, FIE)) {
+ if (!I.isDebugInstr()) {
+ LLVM_DEBUG(dbgs() << "Predicating: " << I);
+ NumPredicatedInstructions++;
+ }
+ }
+
+ // Even though we're optimizing for size at the expense of performance,
+ // avoid creating really long predicated blocks.
+ if (NumPredicatedInstructions > 15)
+ return false;
+
+ // Some targets (e.g. Thumb2) need to insert extra instructions to
+ // start predicated blocks.
+ unsigned ExtraPredicateBytes = TII->extraSizeToPredicateInstructions(
+ MF, NumPredicatedInstructions);
+
+ LLVM_DEBUG(dbgs() << "MeetIfcvtSizeLimit(BranchBytes=" << BranchBytes
+ << ", CommonBytes=" << CommonBytes
+ << ", NumPredicatedInstructions="
+ << NumPredicatedInstructions
+ << ", ExtraPredicateBytes=" << ExtraPredicateBytes
+ << ")\n");
+ return (BranchBytes + CommonBytes) > ExtraPredicateBytes;
+ } else {
+ unsigned TCycle = TBBInfo.NonPredSize + TBBInfo.ExtraCost - Dups;
+ unsigned FCycle = FBBInfo.NonPredSize + FBBInfo.ExtraCost - Dups;
+ bool Res = TCycle > 0 && FCycle > 0 &&
+ TII->isProfitableToIfCvt(
+ *TBBInfo.BB, TCycle, TBBInfo.ExtraCost2, *FBBInfo.BB,
+ FCycle, FBBInfo.ExtraCost2, Prediction);
+ LLVM_DEBUG(dbgs() << "MeetIfcvtSizeLimit(TCycle=" << TCycle
+ << ", FCycle=" << FCycle
+ << ", TExtra=" << TBBInfo.ExtraCost2 << ", FExtra="
+ << FBBInfo.ExtraCost2 << ") = " << Res << "\n");
+ return Res;
+ }
}
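// Editor's worked example of the minsize heuristic above (hypothetical byte
// counts, not part of this patch): suppose the blocks share 16 bytes of
// common instructions (CommonBytes = 8 after halving) and if-conversion
// removes 4 bytes of branches (BranchBytes = 4). Then the test
//   (BranchBytes + CommonBytes) > ExtraPredicateBytes
//   (4 + 8) > ExtraPredicateBytes
// accepts the conversion whenever the predication overhead is at most 11
// bytes.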
/// Returns true if Block ends without a terminator.
@@ -356,8 +455,10 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
if (!PreRegAlloc) {
// Tail merging tends to expose more if-conversion opportunities.
BranchFolder BF(true, false, MBFI, *MBPI);
- BFChange = BF.OptimizeFunction(MF, TII, ST.getRegisterInfo(),
- getAnalysisIfAvailable<MachineModuleInfo>());
+ auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>();
+ BFChange = BF.OptimizeFunction(
+ MF, TII, ST.getRegisterInfo(),
+ MMIWP ? &MMIWP->getMMI() : nullptr);
}
LLVM_DEBUG(dbgs() << "\nIfcvt: function (" << ++FnNum << ") \'"
@@ -496,8 +597,10 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
if (MadeChange && IfCvtBranchFold) {
BranchFolder BF(false, false, MBFI, *MBPI);
- BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(),
- getAnalysisIfAvailable<MachineModuleInfo>());
+ auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>();
+ BF.OptimizeFunction(
+ MF, TII, MF.getSubtarget().getRegisterInfo(),
+ MMIWP ? &MMIWP->getMMI() : nullptr);
}
MadeChange |= BFChange;
@@ -569,6 +672,9 @@ bool IfConverter::ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI,
bool FalseBranch, unsigned &Dups,
BranchProbability Prediction) const {
Dups = 0;
+ if (TrueBBI.BB == FalseBBI.BB)
+ return false;
+
if (TrueBBI.IsBeingAnalyzed || TrueBBI.IsDone)
return false;
@@ -835,6 +941,8 @@ bool IfConverter::ValidForkedDiamond(
TrueBBICalc.BB = TrueBBI.BB;
FalseBBICalc.BB = FalseBBI.BB;
+ TrueBBICalc.IsBrAnalyzable = TrueBBI.IsBrAnalyzable;
+ FalseBBICalc.IsBrAnalyzable = FalseBBI.IsBrAnalyzable;
if (!RescanInstructions(TIB, FIB, TIE, FIE, TrueBBICalc, FalseBBICalc))
return false;
@@ -892,6 +1000,8 @@ bool IfConverter::ValidDiamond(
TrueBBICalc.BB = TrueBBI.BB;
FalseBBICalc.BB = FalseBBI.BB;
+ TrueBBICalc.IsBrAnalyzable = TrueBBI.IsBrAnalyzable;
+ FalseBBICalc.IsBrAnalyzable = FalseBBI.IsBrAnalyzable;
if (!RescanInstructions(TIB, FIB, TIE, FIE, TrueBBICalc, FalseBBICalc))
return false;
// The size is used to decide whether to if-convert, and the shared portions
@@ -912,6 +1022,12 @@ void IfConverter::AnalyzeBranches(BBInfo &BBI) {
BBI.BrCond.clear();
BBI.IsBrAnalyzable =
!TII->analyzeBranch(*BBI.BB, BBI.TrueBB, BBI.FalseBB, BBI.BrCond);
+ if (!BBI.IsBrAnalyzable) {
+ BBI.TrueBB = nullptr;
+ BBI.FalseBB = nullptr;
+ BBI.BrCond.clear();
+ }
+
SmallVector<MachineOperand, 4> RevCond(BBI.BrCond.begin(), BBI.BrCond.end());
BBI.IsBrReversible = (RevCond.size() == 0) ||
!TII->reverseBranchCondition(RevCond);
@@ -1173,13 +1289,9 @@ void IfConverter::AnalyzeBlock(
if (CanRevCond) {
BBInfo TrueBBICalc, FalseBBICalc;
- auto feasibleDiamond = [&]() {
- bool MeetsSize = MeetIfcvtSizeLimit(
- *TrueBBI.BB, (TrueBBICalc.NonPredSize - (Dups + Dups2) +
- TrueBBICalc.ExtraCost), TrueBBICalc.ExtraCost2,
- *FalseBBI.BB, (FalseBBICalc.NonPredSize - (Dups + Dups2) +
- FalseBBICalc.ExtraCost), FalseBBICalc.ExtraCost2,
- Prediction);
+ auto feasibleDiamond = [&](bool Forked) {
+ bool MeetsSize = MeetIfcvtSizeLimit(TrueBBICalc, FalseBBICalc, *BB,
+ Dups + Dups2, Prediction, Forked);
bool TrueFeasible = FeasibilityAnalysis(TrueBBI, BBI.BrCond,
/* IsTriangle */ false, /* RevCond */ false,
/* hasCommonTail */ true);
@@ -1191,7 +1303,7 @@ void IfConverter::AnalyzeBlock(
if (ValidDiamond(TrueBBI, FalseBBI, Dups, Dups2,
TrueBBICalc, FalseBBICalc)) {
- if (feasibleDiamond()) {
+ if (feasibleDiamond(false)) {
// Diamond:
// EBB
// / \_
@@ -1200,14 +1312,14 @@ void IfConverter::AnalyzeBlock(
// \ /
// TailBB
// Note TailBB can be empty.
- Tokens.push_back(llvm::make_unique<IfcvtToken>(
+ Tokens.push_back(std::make_unique<IfcvtToken>(
BBI, ICDiamond, TNeedSub | FNeedSub, Dups, Dups2,
(bool) TrueBBICalc.ClobbersPred, (bool) FalseBBICalc.ClobbersPred));
Enqueued = true;
}
} else if (ValidForkedDiamond(TrueBBI, FalseBBI, Dups, Dups2,
TrueBBICalc, FalseBBICalc)) {
- if (feasibleDiamond()) {
+ if (feasibleDiamond(true)) {
// ForkedDiamond:
// if TBB and FBB have a common tail that includes their conditional
// branch instructions, then we can If Convert this pattern.
@@ -1218,7 +1330,7 @@ void IfConverter::AnalyzeBlock(
// / \ / \
// FalseBB TrueBB FalseBB
//
- Tokens.push_back(llvm::make_unique<IfcvtToken>(
+ Tokens.push_back(std::make_unique<IfcvtToken>(
BBI, ICForkedDiamond, TNeedSub | FNeedSub, Dups, Dups2,
(bool) TrueBBICalc.ClobbersPred, (bool) FalseBBICalc.ClobbersPred));
Enqueued = true;
@@ -1238,7 +1350,7 @@ void IfConverter::AnalyzeBlock(
// | /
// FBB
Tokens.push_back(
- llvm::make_unique<IfcvtToken>(BBI, ICTriangle, TNeedSub, Dups));
+ std::make_unique<IfcvtToken>(BBI, ICTriangle, TNeedSub, Dups));
Enqueued = true;
}
@@ -1247,7 +1359,7 @@ void IfConverter::AnalyzeBlock(
TrueBBI.ExtraCost2, Prediction) &&
FeasibilityAnalysis(TrueBBI, BBI.BrCond, true, true)) {
Tokens.push_back(
- llvm::make_unique<IfcvtToken>(BBI, ICTriangleRev, TNeedSub, Dups));
+ std::make_unique<IfcvtToken>(BBI, ICTriangleRev, TNeedSub, Dups));
Enqueued = true;
}
@@ -1263,7 +1375,7 @@ void IfConverter::AnalyzeBlock(
// |
// FBB
Tokens.push_back(
- llvm::make_unique<IfcvtToken>(BBI, ICSimple, TNeedSub, Dups));
+ std::make_unique<IfcvtToken>(BBI, ICSimple, TNeedSub, Dups));
Enqueued = true;
}
@@ -1275,7 +1387,7 @@ void IfConverter::AnalyzeBlock(
FalseBBI.NonPredSize + FalseBBI.ExtraCost,
FalseBBI.ExtraCost2, Prediction.getCompl()) &&
FeasibilityAnalysis(FalseBBI, RevCond, true)) {
- Tokens.push_back(llvm::make_unique<IfcvtToken>(BBI, ICTriangleFalse,
+ Tokens.push_back(std::make_unique<IfcvtToken>(BBI, ICTriangleFalse,
FNeedSub, Dups));
Enqueued = true;
}
@@ -1287,7 +1399,7 @@ void IfConverter::AnalyzeBlock(
FalseBBI.ExtraCost2, Prediction.getCompl()) &&
FeasibilityAnalysis(FalseBBI, RevCond, true, true)) {
Tokens.push_back(
- llvm::make_unique<IfcvtToken>(BBI, ICTriangleFRev, FNeedSub, Dups));
+ std::make_unique<IfcvtToken>(BBI, ICTriangleFRev, FNeedSub, Dups));
Enqueued = true;
}
@@ -1297,7 +1409,7 @@ void IfConverter::AnalyzeBlock(
FalseBBI.ExtraCost2, Prediction.getCompl()) &&
FeasibilityAnalysis(FalseBBI, RevCond)) {
Tokens.push_back(
- llvm::make_unique<IfcvtToken>(BBI, ICSimpleFalse, FNeedSub, Dups));
+ std::make_unique<IfcvtToken>(BBI, ICSimpleFalse, FNeedSub, Dups));
Enqueued = true;
}
}
@@ -1730,6 +1842,11 @@ bool IfConverter::IfConvertDiamondCommon(
++i;
}
while (NumDups1 != 0) {
+ // Since this instruction is going to be deleted, update the call
+ // site info state if it is a call instruction.
+ if (DI2->isCall(MachineInstr::IgnoreBundle))
+ MBB2.getParent()->eraseCallSiteInfo(&*DI2);
+
++DI2;
if (DI2 == MBB2.end())
break;
@@ -1758,14 +1875,27 @@ bool IfConverter::IfConvertDiamondCommon(
if (!BBI1->IsBrAnalyzable)
verifySameBranchInstructions(&MBB1, &MBB2);
#endif
- BBI1->NonPredSize -= TII->removeBranch(*BBI1->BB);
- // Remove duplicated instructions.
+ // Remove duplicated instructions from the tail of MBB1: any branch
+ // instructions, and the common instructions counted by NumDups2.
DI1 = MBB1.end();
+ while (DI1 != MBB1.begin()) {
+ MachineBasicBlock::iterator Prev = std::prev(DI1);
+ if (!Prev->isBranch() && !Prev->isDebugInstr())
+ break;
+ DI1 = Prev;
+ }
for (unsigned i = 0; i != NumDups2; ) {
// NumDups2 only counted non-dbg_value instructions, so this won't
// run off the head of the list.
assert(DI1 != MBB1.begin());
+
--DI1;
+
+ // Since this instruction is going to be deleted, update the call
+ // site info state if it is a call instruction.
+ if (DI1->isCall(MachineInstr::IgnoreBundle))
+ MBB1.getParent()->eraseCallSiteInfo(&*DI1);
+
// Skip dbg_value instructions.
if (!DI1->isDebugInstr())
++i;
@@ -1815,7 +1945,7 @@ bool IfConverter::IfConvertDiamondCommon(
for (const MachineOperand &MO : FI.operands()) {
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
if (MO.isDef()) {
@@ -1983,7 +2113,7 @@ static bool MaySpeculate(const MachineInstr &MI,
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
if (MO.isDef() && !LaterRedefs.count(Reg))
@@ -2050,6 +2180,10 @@ void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
break;
MachineInstr *MI = MF.CloneMachineInstr(&I);
+ // Make a copy of the call site info.
+ if (MI->isCall(MachineInstr::IgnoreBundle))
+ MF.copyCallSiteInfo(&I, MI);
+
ToBBI.BB->insert(ToBBI.BB->end(), MI);
ToBBI.NonPredSize++;
unsigned ExtraPredCost = TII->getPredicationCost(I);
diff --git a/lib/CodeGen/ImplicitNullChecks.cpp b/lib/CodeGen/ImplicitNullChecks.cpp
index 1e82ea659617..b7dcaec90106 100644
--- a/lib/CodeGen/ImplicitNullChecks.cpp
+++ b/lib/CodeGen/ImplicitNullChecks.cpp
@@ -278,12 +278,12 @@ bool ImplicitNullChecks::canReorder(const MachineInstr *A,
if (!(MOA.isReg() && MOA.getReg()))
continue;
- unsigned RegA = MOA.getReg();
+ Register RegA = MOA.getReg();
for (auto MOB : B->operands()) {
if (!(MOB.isReg() && MOB.getReg()))
continue;
- unsigned RegB = MOB.getReg();
+ Register RegB = MOB.getReg();
if (TRI->regsOverlap(RegA, RegB) && (MOA.isDef() || MOB.isDef()))
return false;
@@ -517,7 +517,7 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
//
// we must ensure that there are no instructions between the 'test' and
// conditional jump that modify %rax.
- const unsigned PointerReg = MBP.LHS.getReg();
+ const Register PointerReg = MBP.LHS.getReg();
assert(MBP.ConditionDef->getParent() == &MBB && "Should be in basic block");
@@ -689,7 +689,7 @@ void ImplicitNullChecks::rewriteNullChecks(
for (const MachineOperand &MO : FaultingInstr->operands()) {
if (!MO.isReg() || !MO.isDef())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg || MBB->isLiveIn(Reg))
continue;
MBB->addLiveIn(Reg);
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp
index 41ae8061a917..2408f18678e4 100644
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp
@@ -11,7 +11,6 @@
//
//===----------------------------------------------------------------------===//
-#include "LiveRangeCalc.h"
#include "Spiller.h"
#include "SplitKit.h"
#include "llvm/ADT/ArrayRef.h"
@@ -26,6 +25,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRangeCalc.h"
#include "llvm/CodeGen/LiveRangeEdit.h"
#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -346,8 +346,7 @@ void InlineSpiller::collectRegsToSpill() {
}
bool InlineSpiller::isSibling(unsigned Reg) {
- return TargetRegisterInfo::isVirtualRegister(Reg) &&
- VRM.getOriginal(Reg) == Original;
+ return Register::isVirtualRegister(Reg) && VRM.getOriginal(Reg) == Original;
}
/// It is beneficial to spill to earlier place in the same BB in case
@@ -377,7 +376,7 @@ bool InlineSpiller::hoistSpillInsideBB(LiveInterval &SpillLI,
assert(VNI && VNI->def == Idx.getRegSlot() && "Not defined by copy");
#endif
- unsigned SrcReg = CopyMI.getOperand(1).getReg();
+ Register SrcReg = CopyMI.getOperand(1).getReg();
LiveInterval &SrcLI = LIS.getInterval(SrcReg);
VNInfo *SrcVNI = SrcLI.getVNInfoAt(Idx);
LiveQueryResult SrcQ = SrcLI.Query(Idx);
@@ -845,9 +844,8 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, unsigned>> Ops,
for (MIBundleOperands MO(*MI); MO.isValid(); ++MO) {
if (!MO->isReg())
continue;
- unsigned Reg = MO->getReg();
- if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg) ||
- MRI.isReserved(Reg)) {
+ Register Reg = MO->getReg();
+ if (!Reg || Register::isVirtualRegister(Reg) || MRI.isReserved(Reg)) {
continue;
}
// Skip non-Defs, including undef uses and internal reads.
@@ -869,7 +867,7 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, unsigned>> Ops,
--NumSpills;
LIS.ReplaceMachineInstrInMaps(*MI, *FoldMI);
if (MI->isCall())
- MI->getMF()->updateCallSiteInfo(MI, FoldMI);
+ MI->getMF()->moveCallSiteInfo(MI, FoldMI);
MI->eraseFromParent();
// Insert any new instructions other than FoldMI into the LIS maps.
@@ -1111,8 +1109,8 @@ void InlineSpiller::spillAll() {
void InlineSpiller::spill(LiveRangeEdit &edit) {
++NumSpilledRanges;
Edit = &edit;
- assert(!TargetRegisterInfo::isStackSlot(edit.getReg())
- && "Trying to spill a stack slot.");
+ assert(!Register::isStackSlot(edit.getReg()) &&
+ "Trying to spill a stack slot.");
// Share a stack slot among all descendants of Original.
Original = VRM.getOriginal(edit.getReg());
StackSlot = VRM.getStackSlot(Original);
@@ -1147,7 +1145,7 @@ void HoistSpillHelper::addToMergeableSpills(MachineInstr &Spill, int StackSlot,
// save a copy of LiveInterval in StackSlotToOrigLI because the original
// LiveInterval may be cleared after all its references are spilled.
if (StackSlotToOrigLI.find(StackSlot) == StackSlotToOrigLI.end()) {
- auto LI = llvm::make_unique<LiveInterval>(OrigLI.reg, OrigLI.weight);
+ auto LI = std::make_unique<LiveInterval>(OrigLI.reg, OrigLI.weight);
LI->assign(OrigLI, Allocator);
StackSlotToOrigLI[StackSlot] = std::move(LI);
}
@@ -1459,7 +1457,7 @@ void HoistSpillHelper::hoistAllSpills() {
LiveRangeEdit Edit(nullptr, NewVRegs, MF, LIS, &VRM, this);
for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ unsigned Reg = Register::index2VirtReg(i);
unsigned Original = VRM.getPreSplitReg(Reg);
if (!MRI.def_empty(Reg))
Virt2SiblingsMap[Original].insert(Reg);
diff --git a/lib/CodeGen/InterleavedLoadCombinePass.cpp b/lib/CodeGen/InterleavedLoadCombinePass.cpp
index 9525da849e2a..770c4952d169 100644
--- a/lib/CodeGen/InterleavedLoadCombinePass.cpp
+++ b/lib/CodeGen/InterleavedLoadCombinePass.cpp
@@ -940,8 +940,8 @@ public:
/// \param V input value
/// \param Result result polynomial
static void computePolynomial(Value &V, Polynomial &Result) {
- if (isa<BinaryOperator>(&V))
- computePolynomialBinOp(*dyn_cast<BinaryOperator>(&V), Result);
+ if (auto *BO = dyn_cast<BinaryOperator>(&V))
+ computePolynomialBinOp(*BO, Result);
else
Result = Polynomial(&V);
}
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index 886ae7e94adb..1c362aec6e67 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -96,14 +96,15 @@ LLVMTargetMachine::getTargetTransformInfo(const Function &F) {
/// addPassesToX helper drives creation and initialization of TargetPassConfig.
static TargetPassConfig *
addPassesToGenerateCode(LLVMTargetMachine &TM, PassManagerBase &PM,
- bool DisableVerify, MachineModuleInfo &MMI) {
+ bool DisableVerify,
+ MachineModuleInfoWrapperPass &MMIWP) {
// Targets may override createPassConfig to provide a target-specific
// subclass.
TargetPassConfig *PassConfig = TM.createPassConfig(PM);
// Set PassConfig options provided by TargetMachine.
PassConfig->setDisableVerify(DisableVerify);
PM.add(PassConfig);
- PM.add(&MMI);
+ PM.add(&MMIWP);
if (PassConfig->addISelPasses())
return nullptr;
@@ -139,7 +140,7 @@ bool LLVMTargetMachine::addAsmPrinter(PassManagerBase &PM,
std::unique_ptr<MCAsmBackend> MAB(
getTarget().createMCAsmBackend(STI, MRI, Options.MCOptions));
- auto FOut = llvm::make_unique<formatted_raw_ostream>(Out);
+ auto FOut = std::make_unique<formatted_raw_ostream>(Out);
MCStreamer *S = getTarget().createAsmStreamer(
Context, std::move(FOut), Options.MCOptions.AsmVerbose,
Options.MCOptions.MCUseDwarfDirectory, InstPrinter, std::move(MCE),
@@ -186,17 +187,15 @@ bool LLVMTargetMachine::addAsmPrinter(PassManagerBase &PM,
return false;
}
-bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
- raw_pwrite_stream &Out,
- raw_pwrite_stream *DwoOut,
- CodeGenFileType FileType,
- bool DisableVerify,
- MachineModuleInfo *MMI) {
+bool LLVMTargetMachine::addPassesToEmitFile(
+ PassManagerBase &PM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut,
+ CodeGenFileType FileType, bool DisableVerify,
+ MachineModuleInfoWrapperPass *MMIWP) {
// Add common CodeGen passes.
- if (!MMI)
- MMI = new MachineModuleInfo(this);
+ if (!MMIWP)
+ MMIWP = new MachineModuleInfoWrapperPass(this);
TargetPassConfig *PassConfig =
- addPassesToGenerateCode(*this, PM, DisableVerify, *MMI);
+ addPassesToGenerateCode(*this, PM, DisableVerify, *MMIWP);
if (!PassConfig)
return true;
@@ -206,12 +205,13 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
// testing to be meaningful, we need to ensure that the symbols created
// are MCSymbolXCOFF variants, which requires that
// the TargetLoweringObjectFile instance has been initialized.
- MCContext &Ctx = MMI->getContext();
+ MCContext &Ctx = MMIWP->getMMI().getContext();
const_cast<TargetLoweringObjectFile &>(*this->getObjFileLowering())
.Initialize(Ctx, *this);
}
PM.add(createPrintMIRPass(Out));
- } else if (addAsmPrinter(PM, Out, DwoOut, FileType, MMI->getContext()))
+ } else if (addAsmPrinter(PM, Out, DwoOut, FileType,
+ MMIWP->getMMI().getContext()))
return true;
PM.add(createFreeMachineFunctionPass());
@@ -227,15 +227,15 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, MCContext *&Ctx,
raw_pwrite_stream &Out,
bool DisableVerify) {
// Add common CodeGen passes.
- MachineModuleInfo *MMI = new MachineModuleInfo(this);
+ MachineModuleInfoWrapperPass *MMIWP = new MachineModuleInfoWrapperPass(this);
TargetPassConfig *PassConfig =
- addPassesToGenerateCode(*this, PM, DisableVerify, *MMI);
+ addPassesToGenerateCode(*this, PM, DisableVerify, *MMIWP);
if (!PassConfig)
return true;
assert(TargetPassConfig::willCompleteCodeGenPipeline() &&
"Cannot emit MC with limited codegen pipeline");
- Ctx = &MMI->getContext();
+ Ctx = &MMIWP->getMMI().getContext();
if (Options.MCOptions.MCSaveTempLabels)
Ctx->setAllowTemporaryLabels(false);
diff --git a/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp b/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp
index 200ac0ba15bf..cef5085ae079 100644
--- a/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp
+++ b/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp
@@ -73,18 +73,18 @@ LazyMachineBlockFrequencyInfoPass::calculateIfNotAvailable() const {
if (!MDT) {
LLVM_DEBUG(dbgs() << "Building DominatorTree on the fly\n");
- OwnedMDT = make_unique<MachineDominatorTree>();
+ OwnedMDT = std::make_unique<MachineDominatorTree>();
OwnedMDT->getBase().recalculate(*MF);
MDT = OwnedMDT.get();
}
// Generate LoopInfo from it.
- OwnedMLI = make_unique<MachineLoopInfo>();
+ OwnedMLI = std::make_unique<MachineLoopInfo>();
OwnedMLI->getBase().analyze(MDT->getBase());
MLI = OwnedMLI.get();
}
- OwnedMBFI = make_unique<MachineBlockFrequencyInfo>();
+ OwnedMBFI = std::make_unique<MachineBlockFrequencyInfo>();
OwnedMBFI->calculate(*MF, MBPI, *MLI);
return *OwnedMBFI.get();
}
diff --git a/lib/CodeGen/LexicalScopes.cpp b/lib/CodeGen/LexicalScopes.cpp
index 503821537ed9..ac3ef0e709f3 100644
--- a/lib/CodeGen/LexicalScopes.cpp
+++ b/lib/CodeGen/LexicalScopes.cpp
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/Metadata.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
diff --git a/lib/CodeGen/LiveDebugValues.cpp b/lib/CodeGen/LiveDebugValues.cpp
index a669e64692b9..f1b237d83e8c 100644
--- a/lib/CodeGen/LiveDebugValues.cpp
+++ b/lib/CodeGen/LiveDebugValues.cpp
@@ -7,14 +7,23 @@
//===----------------------------------------------------------------------===//
///
/// This pass implements a data flow analysis that propagates debug location
-/// information by inserting additional DBG_VALUE instructions into the machine
-/// instruction stream. The pass internally builds debug location liveness
-/// ranges to determine the points where additional DBG_VALUEs need to be
-/// inserted.
+/// information by inserting additional DBG_VALUE insts into the machine
+/// instruction stream. Before running, each DBG_VALUE inst corresponds to a
+/// source assignment of a variable. Afterwards, a DBG_VALUE inst specifies a
+/// variable location for the current basic block (see SourceLevelDebugging.rst).
///
/// This is a separate pass from DbgValueHistoryCalculator to facilitate
/// testing and improve modularity.
///
+/// Each variable location is represented by a VarLoc object that identifies the
+/// source variable, its current machine-location, and the DBG_VALUE inst that
+/// specifies the location. Each VarLoc is indexed in the (function-scope)
+/// VarLocMap, giving each VarLoc a unique index. Rather than operate directly
+/// on machine locations, the dataflow analysis in this pass identifies
+/// locations by their index in the VarLocMap, meaning all the variable
+/// locations in a block can be described by a sparse vector of VarLocMap
+/// indexes.
+///
//===----------------------------------------------------------------------===//
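// Editor's sketch of the index-based location sets described above (the
// index value is hypothetical; not part of this patch):
#include "llvm/ADT/SparseBitVector.h"
llvm::SparseBitVector<> InLocs, PredOutLocs;
unsigned ID = 42;      // index of some VarLoc in the VarLocMap
InLocs.set(ID);        // that variable location becomes live
InLocs &= PredOutLocs; // join: intersect with a predecessor's out-set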
#include "llvm/ADT/DenseMap.h"
@@ -68,6 +77,7 @@ using namespace llvm;
#define DEBUG_TYPE "livedebugvalues"
STATISTIC(NumInserted, "Number of DBG_VALUE instructions inserted");
+STATISTIC(NumRemoved, "Number of DBG_VALUE instructions removed");
// If @MI is a DBG_VALUE whose debug value is described by a defined
// register, returns the number of that register; otherwise, returns 0.
@@ -179,8 +189,16 @@ private:
}
};
+ /// Identity of the variable at this location.
const DebugVariable Var;
- const MachineInstr &MI; ///< Only used for cloning a new DBG_VALUE.
+
+ /// The expression applied to this location.
+ const DIExpression *Expr;
+
+ /// DBG_VALUE to clone var/expr information from if this location
+ /// is moved.
+ const MachineInstr &MI;
+
mutable UserValueScopes UVS;
enum VarLocKind {
InvalidKind = 0,
@@ -201,9 +219,9 @@ private:
const ConstantInt *CImm;
} Loc;
- VarLoc(const MachineInstr &MI, LexicalScopes &LS,
- VarLocKind K = InvalidKind)
- : Var(MI), MI(MI), UVS(MI.getDebugLoc(), LS){
+ VarLoc(const MachineInstr &MI, LexicalScopes &LS)
+ : Var(MI), Expr(MI.getDebugExpression()), MI(MI),
+ UVS(MI.getDebugLoc(), LS) {
static_assert((sizeof(Loc) == sizeof(uint64_t)),
"hash does not cover all members of Loc");
assert(MI.isDebugValue() && "not a DBG_VALUE");
@@ -225,17 +243,78 @@ private:
"entry values must be register locations");
}
- /// The constructor for spill locations.
- VarLoc(const MachineInstr &MI, unsigned SpillBase, int SpillOffset,
- LexicalScopes &LS)
- : Var(MI), MI(MI), UVS(MI.getDebugLoc(), LS) {
- assert(MI.isDebugValue() && "not a DBG_VALUE");
- assert(MI.getNumOperands() == 4 && "malformed DBG_VALUE");
- Kind = SpillLocKind;
- Loc.SpillLocation = {SpillBase, SpillOffset};
+ /// Take the variable and machine-location in DBG_VALUE MI, and build an
+ /// entry location using the given expression.
+ static VarLoc CreateEntryLoc(const MachineInstr &MI, LexicalScopes &LS,
+ const DIExpression *EntryExpr) {
+ VarLoc VL(MI, LS);
+ VL.Kind = EntryValueKind;
+ VL.Expr = EntryExpr;
+ return VL;
+ }
+
+ /// Copy the register location in DBG_VALUE MI, updating the register to
+ /// be NewReg.
+ static VarLoc CreateCopyLoc(const MachineInstr &MI, LexicalScopes &LS,
+ unsigned NewReg) {
+ VarLoc VL(MI, LS);
+ assert(VL.Kind == RegisterKind);
+ VL.Loc.RegNo = NewReg;
+ return VL;
+ }
+
+ /// Take the variable described by DBG_VALUE MI, and create a VarLoc
+ /// locating it in the specified spill location.
+ static VarLoc CreateSpillLoc(const MachineInstr &MI, unsigned SpillBase,
+ int SpillOffset, LexicalScopes &LS) {
+ VarLoc VL(MI, LS);
+ assert(VL.Kind == RegisterKind);
+ VL.Kind = SpillLocKind;
+ VL.Loc.SpillLocation = {SpillBase, SpillOffset};
+ return VL;
}
- // Is the Loc field a constant or constant object?
+ /// Create a DBG_VALUE representing this VarLoc in the given function.
+ /// Copies variable-specific information such as DILocalVariable and
+ /// inlining information from the original DBG_VALUE instruction, which may
+ /// have been several transfers ago.
+ MachineInstr *BuildDbgValue(MachineFunction &MF) const {
+ const DebugLoc &DbgLoc = MI.getDebugLoc();
+ bool Indirect = MI.isIndirectDebugValue();
+ const auto &IID = MI.getDesc();
+ const DILocalVariable *Var = MI.getDebugVariable();
+ const DIExpression *DIExpr = MI.getDebugExpression();
+
+ switch (Kind) {
+ case EntryValueKind:
+ // An entry value is a register location -- but with an updated
+ // expression.
+ return BuildMI(MF, DbgLoc, IID, Indirect, Loc.RegNo, Var, Expr);
+ case RegisterKind:
+ // Register locations are like the source DBG_VALUE, but with the
+ // register number from this VarLoc.
+ return BuildMI(MF, DbgLoc, IID, Indirect, Loc.RegNo, Var, DIExpr);
+ case SpillLocKind: {
+ // Spills are indirect DBG_VALUEs, with a base register and offset.
+ // Use the original DBG_VALUE's expression to build the spill location
+ // on top of. FIXME: spill locations created before this pass runs
+ // are not recognized, and not handled here.
+ auto *SpillExpr = DIExpression::prepend(
+ DIExpr, DIExpression::ApplyOffset, Loc.SpillLocation.SpillOffset);
+ unsigned Base = Loc.SpillLocation.SpillBase;
+ return BuildMI(MF, DbgLoc, IID, true, Base, Var, SpillExpr);
+ }
+ case ImmediateKind: {
+ MachineOperand MO = MI.getOperand(0);
+ return BuildMI(MF, DbgLoc, IID, Indirect, MO, Var, DIExpr);
+ }
+ case InvalidKind:
+ llvm_unreachable("Tried to produce DBG_VALUE for invalid VarLoc");
+ }
+ llvm_unreachable("Unrecognized LiveDebugValues.VarLoc.Kind enum");
+ }
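// Editor's note (hypothetical frame setup; not part of this patch): for a
// value spilled at [frame base - 8], the SpillLocKind case above produces an
// indirect DBG_VALUE whose expression carries the offset:
//   DBG_VALUE $rbp, 0, !"x", !DIExpression(DW_OP_constu, 8, DW_OP_minus)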
+
+ /// Is the Loc field a constant or constant object?
bool isConstant() const { return Kind == ImmediateKind; }
/// If this variable is described by a register, return it,
@@ -251,18 +330,42 @@ private:
bool dominates(MachineBasicBlock &MBB) const { return UVS.dominates(&MBB); }
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- LLVM_DUMP_METHOD void dump() const { MI.dump(); }
+ // TRI can be null.
+ void dump(const TargetRegisterInfo *TRI, raw_ostream &Out = dbgs()) const {
+ dbgs() << "VarLoc(";
+ switch (Kind) {
+ case RegisterKind:
+ case EntryValueKind:
+ dbgs() << printReg(Loc.RegNo, TRI);
+ break;
+ case SpillLocKind:
+ dbgs() << printReg(Loc.SpillLocation.SpillBase, TRI);
+ dbgs() << "[" << Loc.SpillLocation.SpillOffset << "]";
+ break;
+ case ImmediateKind:
+ dbgs() << Loc.Immediate;
+ break;
+ case InvalidKind:
+ llvm_unreachable("Invalid VarLoc in dump method");
+ }
+
+ dbgs() << ", \"" << Var.getVar()->getName() << "\", " << *Expr << ", ";
+ if (Var.getInlinedAt())
+ dbgs() << "!" << Var.getInlinedAt()->getMetadataID() << ")\n";
+ else
+ dbgs() << "(null))\n";
+ }
#endif
bool operator==(const VarLoc &Other) const {
return Kind == Other.Kind && Var == Other.Var &&
- Loc.Hash == Other.Loc.Hash;
+ Loc.Hash == Other.Loc.Hash && Expr == Other.Expr;
}
/// This operator guarantees that VarLocs are sorted by Variable first.
bool operator<(const VarLoc &Other) const {
- return std::tie(Var, Kind, Loc.Hash) <
- std::tie(Other.Var, Other.Kind, Other.Loc.Hash);
+ return std::tie(Var, Kind, Loc.Hash, Expr) <
+ std::tie(Other.Var, Other.Kind, Other.Loc.Hash, Other.Expr);
}
};
@@ -271,8 +374,8 @@ private:
using VarLocSet = SparseBitVector<>;
using VarLocInMBB = SmallDenseMap<const MachineBasicBlock *, VarLocSet>;
struct TransferDebugPair {
- MachineInstr *TransferInst;
- MachineInstr *DebugInst;
+ MachineInstr *TransferInst; /// Instruction where this transfer occurs.
+ unsigned LocationID; /// Location number for the transfer dest.
};
using TransferMap = SmallVector<TransferDebugPair, 4>;
@@ -320,6 +423,14 @@ private:
Vars.insert({Var, VarLocID});
}
+ /// Insert a set of ranges.
+ void insertFromLocSet(const VarLocSet &ToLoad, const VarLocMap &Map) {
+ for (unsigned Id : ToLoad) {
+ const VarLoc &Var = Map[Id];
+ insert(Id, Var.Var);
+ }
+ }
+
/// Empty the set.
void clear() {
VarLocs.clear();
@@ -333,8 +444,18 @@ private:
}
};
- bool isSpillInstruction(const MachineInstr &MI, MachineFunction *MF,
- unsigned &Reg);
+ /// Tests whether this instruction is a spill to a stack location.
+ bool isSpillInstruction(const MachineInstr &MI, MachineFunction *MF);
+
+ /// Decide if @MI is a spill instruction and return true if it is. We use 2
+ /// criteria to make this decision:
+ /// - Is this instruction a store to a spill slot?
+ /// - Is there a register operand that is both used and killed?
+ /// TODO: Store optimization can fold spills into other stores (including
+ /// other spills). We do not yet handle instructions with more than one
+ /// memory operand.
+ bool isLocationSpill(const MachineInstr &MI, MachineFunction *MF,
+ unsigned &Reg);
+
/// If a given instruction is identified as a restore, return the restored
/// spill location and set \p Reg to the restored register.
Optional<VarLoc::SpillLoc> isRestoreInstruction(const MachineInstr &MI,
@@ -361,13 +482,13 @@ private:
void transferRegisterDef(MachineInstr &MI, OpenRangesSet &OpenRanges,
VarLocMap &VarLocIDs, TransferMap &Transfers,
DebugParamMap &DebugEntryVals);
- bool transferTerminatorInst(MachineInstr &MI, OpenRangesSet &OpenRanges,
- VarLocInMBB &OutLocs, const VarLocMap &VarLocIDs);
+ bool transferTerminator(MachineBasicBlock *MBB, OpenRangesSet &OpenRanges,
+ VarLocInMBB &OutLocs, const VarLocMap &VarLocIDs);
- bool process(MachineInstr &MI, OpenRangesSet &OpenRanges,
+ void process(MachineInstr &MI, OpenRangesSet &OpenRanges,
VarLocInMBB &OutLocs, VarLocMap &VarLocIDs,
TransferMap &Transfers, DebugParamMap &DebugEntryVals,
- bool transferChanges, OverlapMap &OverlapFragments,
+ OverlapMap &OverlapFragments,
VarToFragments &SeenFragments);
void accumulateFragmentMap(MachineInstr &MI, VarToFragments &SeenFragments,
@@ -376,7 +497,12 @@ private:
bool join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs,
const VarLocMap &VarLocIDs,
SmallPtrSet<const MachineBasicBlock *, 16> &Visited,
- SmallPtrSetImpl<const MachineBasicBlock *> &ArtificialBlocks);
+ SmallPtrSetImpl<const MachineBasicBlock *> &ArtificialBlocks,
+ VarLocInMBB &PendingInLocs);
+
+ /// Create DBG_VALUE insts for inlocs that have been propagated but
+ /// had their instruction creation deferred.
+ void flushPendingLocs(VarLocInMBB &PendingInLocs, VarLocMap &VarLocIDs);
bool ExtendRanges(MachineFunction &MF);
@@ -518,7 +644,7 @@ void LiveDebugValues::printVarLocInMBB(const MachineFunction &MF,
const VarLoc &VL = VarLocIDs[VLL];
Out << " Var: " << VL.Var.getVar()->getName();
Out << " MI: ";
- VL.dump();
+ VL.dump(TRI, Out);
}
}
Out << "\n";
@@ -567,11 +693,7 @@ void LiveDebugValues::transferDebugValue(const MachineInstr &MI,
ID = VarLocIDs.insert(VL);
OpenRanges.insert(ID, VL.Var);
} else if (MI.hasOneMemOperand()) {
- // It's a stack spill -- fetch spill base and offset.
- VarLoc::SpillLoc SpillLocation = extractSpillBaseRegAndOffset(MI);
- VarLoc VL(MI, SpillLocation.SpillBase, SpillLocation.SpillOffset, LS);
- ID = VarLocIDs.insert(VL);
- OpenRanges.insert(ID, VL.Var);
+ llvm_unreachable("DBG_VALUE with mem operand encountered after regalloc?");
} else {
// This must be an undefined location. We should leave OpenRanges closed.
assert(MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == 0 &&
@@ -585,7 +707,6 @@ void LiveDebugValues::emitEntryValues(MachineInstr &MI,
TransferMap &Transfers,
DebugParamMap &DebugEntryVals,
SparseBitVector<> &KillSet) {
- MachineFunction *MF = MI.getParent()->getParent();
for (unsigned ID : KillSet) {
if (!VarLocIDs[ID].Var.getVar()->isParameter())
continue;
@@ -600,20 +721,12 @@ void LiveDebugValues::emitEntryValues(MachineInstr &MI,
auto ParamDebugInstr = DebugEntryVals[CurrDebugInstr->getDebugVariable()];
DIExpression *NewExpr = DIExpression::prepend(
ParamDebugInstr->getDebugExpression(), DIExpression::EntryValue);
- MachineInstr *EntryValDbgMI =
- BuildMI(*MF, ParamDebugInstr->getDebugLoc(), ParamDebugInstr->getDesc(),
- ParamDebugInstr->isIndirectDebugValue(),
- ParamDebugInstr->getOperand(0).getReg(),
- ParamDebugInstr->getDebugVariable(), NewExpr);
-
- if (ParamDebugInstr->isIndirectDebugValue())
- EntryValDbgMI->getOperand(1).setImm(
- ParamDebugInstr->getOperand(1).getImm());
-
- Transfers.push_back({&MI, EntryValDbgMI});
- VarLoc VL(*EntryValDbgMI, LS);
- unsigned EntryValLocID = VarLocIDs.insert(VL);
- OpenRanges.insert(EntryValLocID, VL.Var);
+
+ VarLoc EntryLoc = VarLoc::CreateEntryLoc(*ParamDebugInstr, LS, NewExpr);
+
+ unsigned EntryValLocID = VarLocIDs.insert(EntryLoc);
+ Transfers.push_back({&MI, EntryValLocID});
+ OpenRanges.insert(EntryValLocID, EntryLoc.Var);
}
}
@@ -627,21 +740,19 @@ void LiveDebugValues::insertTransferDebugPair(
VarLocMap &VarLocIDs, unsigned OldVarID, TransferKind Kind,
unsigned NewReg) {
const MachineInstr *DebugInstr = &VarLocIDs[OldVarID].MI;
- MachineFunction *MF = MI.getParent()->getParent();
- MachineInstr *NewDebugInstr;
auto ProcessVarLoc = [&MI, &OpenRanges, &Transfers, &DebugInstr,
- &VarLocIDs](VarLoc &VL, MachineInstr *NewDebugInstr) {
+ &VarLocIDs](VarLoc &VL) {
unsigned LocId = VarLocIDs.insert(VL);
// Close this variable's previous location range.
DebugVariable V(*DebugInstr);
OpenRanges.erase(V);
+ // Record the new location as an open range, and a postponed transfer
+ // inserting a DBG_VALUE for this location.
OpenRanges.insert(LocId, VL.Var);
- // The newly created DBG_VALUE instruction NewDebugInstr must be inserted
- // after MI. Keep track of the pairing.
- TransferDebugPair MIP = {&MI, NewDebugInstr};
+ TransferDebugPair MIP = {&MI, LocId};
Transfers.push_back(MIP);
};
@@ -653,37 +764,25 @@ void LiveDebugValues::insertTransferDebugPair(
"No register supplied when handling a copy of a debug value");
// Create a DBG_VALUE instruction to describe the Var in its new
// register location.
- NewDebugInstr = BuildMI(
- *MF, DebugInstr->getDebugLoc(), DebugInstr->getDesc(),
- DebugInstr->isIndirectDebugValue(), NewReg,
- DebugInstr->getDebugVariable(), DebugInstr->getDebugExpression());
- if (DebugInstr->isIndirectDebugValue())
- NewDebugInstr->getOperand(1).setImm(DebugInstr->getOperand(1).getImm());
- VarLoc VL(*NewDebugInstr, LS);
- ProcessVarLoc(VL, NewDebugInstr);
- LLVM_DEBUG(dbgs() << "Creating DBG_VALUE inst for register copy: ";
- NewDebugInstr->print(dbgs(), /*IsStandalone*/false,
- /*SkipOpers*/false, /*SkipDebugLoc*/false,
- /*AddNewLine*/true, TII));
+ VarLoc VL = VarLoc::CreateCopyLoc(*DebugInstr, LS, NewReg);
+ ProcessVarLoc(VL);
+ LLVM_DEBUG({
+ dbgs() << "Creating VarLoc for register copy:";
+ VL.dump(TRI);
+ });
return;
}
case TransferKind::TransferSpill: {
// Create a DBG_VALUE instruction to describe the Var in its spilled
// location.
VarLoc::SpillLoc SpillLocation = extractSpillBaseRegAndOffset(MI);
- auto *SpillExpr = DIExpression::prepend(DebugInstr->getDebugExpression(),
- DIExpression::ApplyOffset,
- SpillLocation.SpillOffset);
- NewDebugInstr = BuildMI(
- *MF, DebugInstr->getDebugLoc(), DebugInstr->getDesc(), true,
- SpillLocation.SpillBase, DebugInstr->getDebugVariable(), SpillExpr);
- VarLoc VL(*NewDebugInstr, SpillLocation.SpillBase,
- SpillLocation.SpillOffset, LS);
- ProcessVarLoc(VL, NewDebugInstr);
- LLVM_DEBUG(dbgs() << "Creating DBG_VALUE inst for spill: ";
- NewDebugInstr->print(dbgs(), /*IsStandalone*/false,
- /*SkipOpers*/false, /*SkipDebugLoc*/false,
- /*AddNewLine*/true, TII));
+ VarLoc VL = VarLoc::CreateSpillLoc(*DebugInstr, SpillLocation.SpillBase,
+ SpillLocation.SpillOffset, LS);
+ ProcessVarLoc(VL);
+ LLVM_DEBUG({
+ dbgs() << "Creating VarLoc for spill:";
+ VL.dump(TRI);
+ });
return;
}
case TransferKind::TransferRestore: {
@@ -691,15 +790,14 @@ void LiveDebugValues::insertTransferDebugPair(
"No register supplied when handling a restore of a debug value");
MachineFunction *MF = MI.getMF();
DIBuilder DIB(*const_cast<Function &>(MF->getFunction()).getParent());
- NewDebugInstr =
- BuildMI(*MF, DebugInstr->getDebugLoc(), DebugInstr->getDesc(), false,
- NewReg, DebugInstr->getDebugVariable(), DIB.createExpression());
- VarLoc VL(*NewDebugInstr, LS);
- ProcessVarLoc(VL, NewDebugInstr);
- LLVM_DEBUG(dbgs() << "Creating DBG_VALUE inst for register restore: ";
- NewDebugInstr->print(dbgs(), /*IsStandalone*/false,
- /*SkipOpers*/false, /*SkipDebugLoc*/false,
- /*AddNewLine*/true, TII));
+ // DebugInstr refers to the pre-spill location, therefore we can reuse
+ // its expression.
+ VarLoc VL = VarLoc::CreateCopyLoc(*DebugInstr, LS, NewReg);
+ ProcessVarLoc(VL);
+ LLVM_DEBUG({
+ dbgs() << "Creating VarLoc for restore:";
+ VL.dump(TRI);
+ });
return;
}
}
@@ -719,7 +817,7 @@ void LiveDebugValues::transferRegisterDef(
// instructions never clobber SP, because some backends (e.g., AArch64)
// never list SP in the regmask.
if (MO.isReg() && MO.isDef() && MO.getReg() &&
- TRI->isPhysicalRegister(MO.getReg()) &&
+ Register::isPhysicalRegister(MO.getReg()) &&
!(MI.isCall() && MO.getReg() == SP)) {
// Remove ranges of all aliased registers.
for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI)
@@ -748,16 +846,8 @@ void LiveDebugValues::transferRegisterDef(
}
}
-/// Decide if @MI is a spill instruction and return true if it is. We use 2
-/// criteria to make this decision:
-/// - Is this instruction a store to a spill slot?
-/// - Is there a register operand that is both used and killed?
-/// TODO: Store optimization can fold spills into other stores (including
-/// other spills). We do not handle this yet (more than one memory operand).
bool LiveDebugValues::isSpillInstruction(const MachineInstr &MI,
- MachineFunction *MF, unsigned &Reg) {
- SmallVector<const MachineMemOperand*, 1> Accesses;
-
+ MachineFunction *MF) {
// TODO: Handle multiple stores folded into one.
if (!MI.hasOneMemOperand())
return false;
@@ -766,6 +856,14 @@ bool LiveDebugValues::isSpillInstruction(const MachineInstr &MI,
return false; // This is not a spill instruction, since no valid size was
// returned from either function.
+ return true;
+}
+
+bool LiveDebugValues::isLocationSpill(const MachineInstr &MI,
+ MachineFunction *MF, unsigned &Reg) {
+ if (!isSpillInstruction(MI, MF))
+ return false;
+
auto isKilledReg = [&](const MachineOperand MO, unsigned &Reg) {
if (!MO.isReg() || !MO.isUse()) {
Reg = 0;
@@ -834,7 +932,37 @@ void LiveDebugValues::transferSpillOrRestoreInst(MachineInstr &MI,
LLVM_DEBUG(dbgs() << "Examining instruction: "; MI.dump(););
- if (isSpillInstruction(MI, MF, Reg)) {
+ // First, if there are any DBG_VALUEs pointing at a spill slot that is
+ // written to, then close the variable location. The value in memory
+ // will have changed.
+ VarLocSet KillSet;
+ if (isSpillInstruction(MI, MF)) {
+ Loc = extractSpillBaseRegAndOffset(MI);
+ for (unsigned ID : OpenRanges.getVarLocs()) {
+ const VarLoc &VL = VarLocIDs[ID];
+ if (VL.Kind == VarLoc::SpillLocKind && VL.Loc.SpillLocation == *Loc) {
+ // This location is overwritten by the current instruction -- terminate
+ // the open range, and insert an explicit DBG_VALUE $noreg.
+ //
+ // Doing this at a later stage would require re-interpreting all
+ // DBG_VALUEs and DIExpressions to identify whether they point at
+ // memory, and then analyzing all memory writes to see if they
+ // overwrite that memory, which is expensive.
+ //
+ // At this stage, we already know which DBG_VALUEs are for spills and
+ // where they are located; it's best to handle overwrites now.
+ KillSet.set(ID);
+ VarLoc UndefVL = VarLoc::CreateCopyLoc(VL.MI, LS, 0);
+ unsigned UndefLocID = VarLocIDs.insert(UndefVL);
+ Transfers.push_back({&MI, UndefLocID});
+ }
+ }
+ OpenRanges.erase(KillSet, VarLocIDs);
+ }
+
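
As a simplified standalone illustration of the overwrite check above, using plain standard-library containers in place of VarLocMap/VarLocSet (a sketch, not the pass's types):

    #include <map>
    #include <set>
    #include <utility>

    using SpillSlot = std::pair<unsigned, int>; // (base register, offset)

    // Close every open location that lives in the slot a store just wrote.
    // Returns the closed location IDs so the caller can emit an explicit
    // "location is now undefined" marker for each of them.
    std::set<unsigned>
    closeOverwritten(const std::map<unsigned, SpillSlot> &Open,
                     const SpillSlot &Written) {
      std::set<unsigned> Killed;
      for (const auto &Entry : Open)
        if (Entry.second == Written) // same slot the store just wrote
          Killed.insert(Entry.first);
      return Killed;
    }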
+ // Try to recognize spill and restore instructions that may create a new
+ // variable location.
+ if (isLocationSpill(MI, MF, Reg)) {
TKind = TransferKind::TransferSpill;
LLVM_DEBUG(dbgs() << "Recognized as spill: "; MI.dump(););
LLVM_DEBUG(dbgs() << "Register: " << Reg << " " << printReg(Reg, TRI)
@@ -854,6 +982,7 @@ void LiveDebugValues::transferSpillOrRestoreInst(MachineInstr &MI,
LLVM_DEBUG(dbgs() << "Spilling Register " << printReg(Reg, TRI) << '('
<< VarLocIDs[ID].Var.getVar()->getName() << ")\n");
} else if (TKind == TransferKind::TransferRestore &&
+ VarLocIDs[ID].Kind == VarLoc::SpillLocKind &&
VarLocIDs[ID].Loc.SpillLocation == *Loc) {
LLVM_DEBUG(dbgs() << "Restoring Register " << printReg(Reg, TRI) << '('
<< VarLocIDs[ID].Var.getVar()->getName() << ")\n");
@@ -885,8 +1014,8 @@ void LiveDebugValues::transferRegisterCopy(MachineInstr &MI,
return false;
};
- unsigned SrcReg = SrcRegOp->getReg();
- unsigned DestReg = DestRegOp->getReg();
+ Register SrcReg = SrcRegOp->getReg();
+ Register DestReg = DestRegOp->getReg();
// We want to recognize instructions where destination register is callee
// saved register. If register that could be clobbered by the call is
@@ -906,26 +1035,20 @@ void LiveDebugValues::transferRegisterCopy(MachineInstr &MI,
}
/// Terminate all open ranges at the end of the current basic block.
-bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI,
- OpenRangesSet &OpenRanges,
- VarLocInMBB &OutLocs,
- const VarLocMap &VarLocIDs) {
+bool LiveDebugValues::transferTerminator(MachineBasicBlock *CurMBB,
+ OpenRangesSet &OpenRanges,
+ VarLocInMBB &OutLocs,
+ const VarLocMap &VarLocIDs) {
bool Changed = false;
- const MachineBasicBlock *CurMBB = MI.getParent();
- if (!(MI.isTerminator() || (&MI == &CurMBB->back())))
- return false;
-
- if (OpenRanges.empty())
- return false;
LLVM_DEBUG(for (unsigned ID
: OpenRanges.getVarLocs()) {
// Copy OpenRanges to OutLocs, if not already present.
dbgs() << "Add to OutLocs in MBB #" << CurMBB->getNumber() << ": ";
- VarLocIDs[ID].dump();
+ VarLocIDs[ID].dump(TRI);
});
VarLocSet &VLS = OutLocs[CurMBB];
- Changed = VLS |= OpenRanges.getVarLocs();
+ Changed = VLS != OpenRanges.getVarLocs();
// New OutLocs set may be different due to spill, restore or register
// copy instruction processing.
if (Changed)
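
Previously `Changed = VLS |= OpenRanges.getVarLocs()` could only ever grow the out-set, so a location invalidated on a later visit would stick around; the compare-then-assign form also notices (and applies) shrinkage, which the removal path in join() now relies on. In miniature, with std::set standing in for the sparse bit vector:

    #include <set>

    using LocSet = std::set<unsigned>;

    // Old behaviour: monotonic union; Changed fires only on growth.
    bool unionInto(LocSet &Out, const LocSet &Open) {
      size_t Before = Out.size();
      Out.insert(Open.begin(), Open.end());
      return Out.size() != Before;
    }

    // New behaviour: detect any difference, then overwrite, so the
    // out-set can also shrink when a location was invalidated.
    bool overwrite(LocSet &Out, const LocSet &Open) {
      bool Changed = Out != Open;
      if (Changed)
        Out = Open;
      return Changed;
    }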
@@ -995,26 +1118,17 @@ void LiveDebugValues::accumulateFragmentMap(MachineInstr &MI,
}
/// This routine creates OpenRanges and OutLocs.
-bool LiveDebugValues::process(MachineInstr &MI, OpenRangesSet &OpenRanges,
+void LiveDebugValues::process(MachineInstr &MI, OpenRangesSet &OpenRanges,
VarLocInMBB &OutLocs, VarLocMap &VarLocIDs,
- TransferMap &Transfers, DebugParamMap &DebugEntryVals,
- bool transferChanges,
+ TransferMap &Transfers,
+ DebugParamMap &DebugEntryVals,
OverlapMap &OverlapFragments,
VarToFragments &SeenFragments) {
- bool Changed = false;
transferDebugValue(MI, OpenRanges, VarLocIDs);
transferRegisterDef(MI, OpenRanges, VarLocIDs, Transfers,
DebugEntryVals);
- if (transferChanges) {
- transferRegisterCopy(MI, OpenRanges, VarLocIDs, Transfers);
- transferSpillOrRestoreInst(MI, OpenRanges, VarLocIDs, Transfers);
- } else {
- // Build up a map of overlapping fragments on the first run through.
- if (MI.isDebugValue())
- accumulateFragmentMap(MI, SeenFragments, OverlapFragments);
- }
- Changed = transferTerminatorInst(MI, OpenRanges, OutLocs, VarLocIDs);
- return Changed;
+ transferRegisterCopy(MI, OpenRanges, VarLocIDs, Transfers);
+ transferSpillOrRestoreInst(MI, OpenRanges, VarLocIDs, Transfers);
}
/// This routine joins the analysis results of all incoming edges in @MBB by
@@ -1024,7 +1138,8 @@ bool LiveDebugValues::join(
MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs,
const VarLocMap &VarLocIDs,
SmallPtrSet<const MachineBasicBlock *, 16> &Visited,
- SmallPtrSetImpl<const MachineBasicBlock *> &ArtificialBlocks) {
+ SmallPtrSetImpl<const MachineBasicBlock *> &ArtificialBlocks,
+ VarLocInMBB &PendingInLocs) {
LLVM_DEBUG(dbgs() << "join MBB: " << MBB.getNumber() << "\n");
bool Changed = false;
@@ -1034,9 +1149,11 @@ bool LiveDebugValues::join(
// can be joined.
int NumVisited = 0;
for (auto p : MBB.predecessors()) {
- // Ignore unvisited predecessor blocks. As we are processing
- // the blocks in reverse post-order any unvisited block can
- // be considered to not remove any incoming values.
+ // Ignore backedges if we have not visited the predecessor yet. As the
+ // predecessor hasn't yet had locations propagated into it, most locations
+ // will not yet be valid, so treat them as all being uninitialized and
+ // potentially valid. If a location guessed to be correct here is
+ // invalidated later, we will remove it when we revisit this block.
if (!Visited.count(p)) {
LLVM_DEBUG(dbgs() << " ignoring unvisited pred MBB: " << p->getNumber()
<< "\n");
@@ -1086,44 +1203,59 @@ bool LiveDebugValues::join(
// is the entry block which has no predecessor.
assert((NumVisited || MBB.pred_empty()) &&
"Should have processed at least one predecessor");
- if (InLocsT.empty())
- return false;
VarLocSet &ILS = InLocs[&MBB];
+ VarLocSet &Pending = PendingInLocs[&MBB];
- // Insert DBG_VALUE instructions, if not already inserted.
+ // New locations will have DBG_VALUE insts inserted at the start of the
+ // block, after location propagation has finished. Record the insertions
+ // that we need to perform in the Pending set.
VarLocSet Diff = InLocsT;
Diff.intersectWithComplement(ILS);
for (auto ID : Diff) {
- // This VarLoc is not found in InLocs i.e. it is not yet inserted. So, a
- // new range is started for the var from the mbb's beginning by inserting
- // a new DBG_VALUE. process() will end this range however appropriate.
- const VarLoc &DiffIt = VarLocIDs[ID];
- const MachineInstr *DebugInstr = &DiffIt.MI;
- MachineInstr *MI = nullptr;
- if (DiffIt.isConstant()) {
- MachineOperand MO(DebugInstr->getOperand(0));
- MI = BuildMI(MBB, MBB.instr_begin(), DebugInstr->getDebugLoc(),
- DebugInstr->getDesc(), false, MO,
- DebugInstr->getDebugVariable(),
- DebugInstr->getDebugExpression());
- } else {
- MI = BuildMI(MBB, MBB.instr_begin(), DebugInstr->getDebugLoc(),
- DebugInstr->getDesc(), DebugInstr->isIndirectDebugValue(),
- DebugInstr->getOperand(0).getReg(),
- DebugInstr->getDebugVariable(),
- DebugInstr->getDebugExpression());
- if (DebugInstr->isIndirectDebugValue())
- MI->getOperand(1).setImm(DebugInstr->getOperand(1).getImm());
- }
- LLVM_DEBUG(dbgs() << "Inserted: "; MI->dump(););
+ Pending.set(ID);
ILS.set(ID);
++NumInserted;
Changed = true;
}
+
+ // We may have lost locations by learning about a predecessor that either
+ // loses or moves a variable. Find any locations in ILS that are not in the
+ // new in-locations, and delete those.
+ VarLocSet Removed = ILS;
+ Removed.intersectWithComplement(InLocsT);
+ for (auto ID : Removed) {
+ Pending.reset(ID);
+ ILS.reset(ID);
+ ++NumRemoved;
+ Changed = true;
+ }
+
return Changed;
}
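
Roughly, this join is an intersection over visited predecessors, with unvisited (back-edge) predecessors treated optimistically; a later revisit corrects any wrong guess. A compact sketch under those simplified assumptions (it ignores the artificial-block and same-variable/different-location handling the real join performs):

    #include <algorithm>
    #include <iterator>
    #include <optional>
    #include <set>
    #include <vector>

    using LocSet = std::set<unsigned>;

    LocSet joinPreds(const std::vector<const LocSet *> &PredOutLocs) {
      // nullptr marks an unvisited predecessor: optimistically assume it
      // agrees with everyone else for now.
      std::optional<LocSet> In;
      for (const LocSet *Out : PredOutLocs) {
        if (!Out)
          continue;
        if (!In) {
          In = *Out;
          continue;
        }
        LocSet Tmp;
        std::set_intersection(Out->begin(), Out->end(), In->begin(),
                              In->end(), std::inserter(Tmp, Tmp.begin()));
        *In = std::move(Tmp);
      }
      return In.value_or(LocSet());
    }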
+void LiveDebugValues::flushPendingLocs(VarLocInMBB &PendingInLocs,
+ VarLocMap &VarLocIDs) {
+ // PendingInLocs records all locations propagated into blocks, which have
+ // not had DBG_VALUE insts created. Go through and create those insts now.
+ for (auto &Iter : PendingInLocs) {
+ // Map is keyed on a constant pointer, unwrap it so we can insert insts.
+ auto &MBB = const_cast<MachineBasicBlock &>(*Iter.first);
+ VarLocSet &Pending = Iter.second;
+
+ for (unsigned ID : Pending) {
+ // The ID location is live-in to MBB -- work out what kind of machine
+ // location it is and create a DBG_VALUE.
+ const VarLoc &DiffIt = VarLocIDs[ID];
+ MachineInstr *MI = DiffIt.BuildDbgValue(*MBB.getParent());
+ MBB.insert(MBB.instr_begin(), MI);
+
+ (void)MI;
+ LLVM_DEBUG(dbgs() << "Inserted: "; MI->dump(););
+ }
+ }
+}
+
/// Calculate the liveness information for the given machine function and
/// extend ranges across basic blocks.
bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
@@ -1140,6 +1272,9 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
VarLocInMBB OutLocs; // Ranges that exist beyond bb.
VarLocInMBB InLocs; // Ranges that are incoming after joining.
TransferMap Transfers; // DBG_VALUEs associated with spills.
+ VarLocInMBB PendingInLocs; // Ranges that are incoming after joining, but
+ // for which creating DBG_VALUE insts has been
+ // deferred until a fixed point is reached.
VarToFragments SeenFragments;
@@ -1156,8 +1291,6 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
std::greater<unsigned int>>
Pending;
- enum : bool { dontTransferChanges = false, transferChanges = true };
-
// Besides parameter's modification, check whether a DBG_VALUE is inlined
// in order to deduce whether the variable that it tracks comes from
// a different function. If that is the case we can't track its entry value.
@@ -1169,7 +1302,7 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
const TargetLowering *TLI = MF.getSubtarget().getTargetLowering();
unsigned SP = TLI->getStackPointerRegisterToSaveRestore();
- unsigned FP = TRI->getFrameRegister(MF);
+ Register FP = TRI->getFrameRegister(MF);
auto IsRegOtherThanSPAndFP = [&](const MachineOperand &Op) -> bool {
return Op.isReg() && Op.getReg() != SP && Op.getReg() != FP;
};
@@ -1195,23 +1328,14 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
!MI.getDebugExpression()->isFragment())
DebugEntryVals[MI.getDebugVariable()] = &MI;
- // Initialize every mbb with OutLocs.
- // We are not looking at any spill instructions during the initial pass
- // over the BBs. The LiveDebugVariables pass has already created DBG_VALUE
- // instructions for spills of registers that are known to be user variables
- // within the BB in which the spill occurs.
+ // Initialize per-block structures and scan for fragment overlaps.
for (auto &MBB : MF) {
+ PendingInLocs[&MBB] = VarLocSet();
+
for (auto &MI : MBB) {
- process(MI, OpenRanges, OutLocs, VarLocIDs, Transfers, DebugEntryVals,
- dontTransferChanges, OverlapFragments, SeenFragments);
- }
- // Add any entry DBG_VALUE instructions necessitated by parameter
- // clobbering.
- for (auto &TR : Transfers) {
- MBB.insertAfter(MachineBasicBlock::iterator(*TR.TransferInst),
- TR.DebugInst);
+ if (MI.isDebugValue())
+ accumulateFragmentMap(MI, SeenFragments, OverlapFragments);
}
- Transfers.clear();
}
auto hasNonArtificialLocation = [](const MachineInstr &MI) -> bool {
@@ -1248,26 +1372,21 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
while (!Worklist.empty()) {
MachineBasicBlock *MBB = OrderToBB[Worklist.top()];
Worklist.pop();
- MBBJoined =
- join(*MBB, OutLocs, InLocs, VarLocIDs, Visited, ArtificialBlocks);
- Visited.insert(MBB);
+ MBBJoined = join(*MBB, OutLocs, InLocs, VarLocIDs, Visited,
+ ArtificialBlocks, PendingInLocs);
+ MBBJoined |= Visited.insert(MBB).second;
if (MBBJoined) {
MBBJoined = false;
Changed = true;
// Now that we have started to extend ranges across BBs we need to
// examine spill instructions to see whether they spill registers that
// correspond to user variables.
+ // First load any pending inlocs.
+ OpenRanges.insertFromLocSet(PendingInLocs[MBB], VarLocIDs);
for (auto &MI : *MBB)
- OLChanged |=
process(MI, OpenRanges, OutLocs, VarLocIDs, Transfers,
- DebugEntryVals, transferChanges, OverlapFragments,
- SeenFragments);
-
- // Add any DBG_VALUE instructions necessitated by spills.
- for (auto &TR : Transfers)
- MBB->insertAfter(MachineBasicBlock::iterator(*TR.TransferInst),
- TR.DebugInst);
- Transfers.clear();
+ DebugEntryVals, OverlapFragments, SeenFragments);
+ OLChanged |= transferTerminator(MBB, OpenRanges, OutLocs, VarLocIDs);
LLVM_DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs,
"OutLocs after propagating", dbgs()));
@@ -1289,6 +1408,19 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
assert(Pending.empty() && "Pending should be empty");
}
+ // Add any DBG_VALUE instructions created by location transfers.
+ for (auto &TR : Transfers) {
+ MachineBasicBlock *MBB = TR.TransferInst->getParent();
+ const VarLoc &VL = VarLocIDs[TR.LocationID];
+ MachineInstr *MI = VL.BuildDbgValue(MF);
+ MBB->insertAfterBundle(TR.TransferInst->getIterator(), MI);
+ }
+ Transfers.clear();
+
+ // Deferred inlocs will not have had any DBG_VALUE insts created; do
+ // that now.
+ flushPendingLocs(PendingInLocs, VarLocIDs);
+
LLVM_DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "Final OutLocs", dbgs()));
LLVM_DEBUG(printVarLocInMBB(MF, InLocs, VarLocIDs, "Final InLocs", dbgs()));
return Changed;
@@ -1308,7 +1440,7 @@ bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) {
TII = MF.getSubtarget().getInstrInfo();
TFI = MF.getSubtarget().getFrameLowering();
TFI->determineCalleeSaves(MF, CalleeSavedRegs,
- make_unique<RegScavenger>().get());
+ std::make_unique<RegScavenger>().get());
LS.initialize(MF);
bool Changed = ExtendRanges(MF);
diff --git a/lib/CodeGen/LiveDebugVariables.cpp b/lib/CodeGen/LiveDebugVariables.cpp
index 656ec7d4bdfd..2dd462fc72b3 100644
--- a/lib/CodeGen/LiveDebugVariables.cpp
+++ b/lib/CodeGen/LiveDebugVariables.cpp
@@ -99,28 +99,27 @@ enum : unsigned { UndefLocNo = ~0U };
/// usage of the location.
class DbgValueLocation {
public:
- DbgValueLocation(unsigned LocNo, bool WasIndirect)
- : LocNo(LocNo), WasIndirect(WasIndirect) {
+ DbgValueLocation(unsigned LocNo) : LocNo(LocNo) {
static_assert(sizeof(*this) == sizeof(unsigned), "bad bitfield packing");
assert(locNo() == LocNo && "location truncation");
}
- DbgValueLocation() : LocNo(0), WasIndirect(0) {}
+ DbgValueLocation() : LocNo(0) {}
unsigned locNo() const {
// Fix up the undef location number, which gets truncated.
return LocNo == INT_MAX ? UndefLocNo : LocNo;
}
- bool wasIndirect() const { return WasIndirect; }
bool isUndef() const { return locNo() == UndefLocNo; }
DbgValueLocation changeLocNo(unsigned NewLocNo) const {
- return DbgValueLocation(NewLocNo, WasIndirect);
+ return DbgValueLocation(NewLocNo);
}
friend inline bool operator==(const DbgValueLocation &LHS,
const DbgValueLocation &RHS) {
- return LHS.LocNo == RHS.LocNo && LHS.WasIndirect == RHS.WasIndirect;
+ return LHS.LocNo == RHS.LocNo;
}
friend inline bool operator!=(const DbgValueLocation &LHS,
@@ -129,8 +128,7 @@ public:
}
private:
- unsigned LocNo : 31;
- unsigned WasIndirect : 1;
+ unsigned LocNo;
};
/// Map of where a user value is live, and its location.
@@ -144,22 +142,51 @@ namespace {
class LDVImpl;
+/// A UserValue is uniquely identified by the source variable it refers to
+/// (Variable), the expression describing how to get the value (Expression),
+/// and the specific usage (InlinedAt). InlinedAt differentiates between
+/// inlined and non-inlined occurrences of a function, and between multiple
+/// inlined instances in the same scope.
+/// FIXME: The only part of the Expression which matters for UserValue
+/// identification is the fragment part.
+class UserValueIdentity {
+private:
+ /// The debug info variable we are part of.
+ const DILocalVariable *Variable;
+ /// Any complex address expression.
+ const DIExpression *Expression;
+ /// Function usage identification.
+ const DILocation *InlinedAt;
+
+public:
+ UserValueIdentity(const DILocalVariable *Var, const DIExpression *Expr,
+ const DILocation *IA)
+ : Variable(Var), Expression(Expr), InlinedAt(IA) {}
+
+ bool match(const DILocalVariable *Var, const DIExpression *Expr,
+ const DILocation *IA) const {
+ // FIXME: The fragment should be part of the identity, but not
+ // other things in the expression like stack values.
+ return Var == Variable && Expr == Expression && IA == InlinedAt;
+ }
+
+ bool match(const UserValueIdentity &Other) const {
+ return match(Other.Variable, Other.Expression, Other.InlinedAt);
+ }
+
+ unsigned hash_value() const {
+ return hash_combine(Variable, Expression, InlinedAt);
+ }
+};
+
/// A user value is a part of a debug info user variable.
///
/// A DBG_VALUE instruction notes that (a sub-register of) a virtual register
/// holds part of a user variable. The part is identified by a byte offset.
-///
-/// UserValues are grouped into equivalence classes for easier searching. Two
-/// user values are related if they refer to the same variable, or if they are
-/// held by the same virtual register. The equivalence class is the transitive
-/// closure of that relation.
class UserValue {
const DILocalVariable *Variable; ///< The debug info variable we are part of.
const DIExpression *Expression; ///< Any complex address expression.
DebugLoc dl; ///< The debug location for the variable. This is
///< used by dwarf writer to find lexical scope.
- UserValue *leader; ///< Equivalence class leader.
- UserValue *next = nullptr; ///< Next value in equivalence class, or null.
/// Numbered locations referenced by locmap.
SmallVector<MachineOperand, 4> locations;
@@ -180,49 +207,15 @@ class UserValue {
LiveIntervals &LIS);
public:
+ UserValue(const UserValue &) = delete;
+
/// Create a new UserValue.
UserValue(const DILocalVariable *var, const DIExpression *expr, DebugLoc L,
LocMap::Allocator &alloc)
- : Variable(var), Expression(expr), dl(std::move(L)), leader(this),
- locInts(alloc) {}
-
- /// Get the leader of this value's equivalence class.
- UserValue *getLeader() {
- UserValue *l = leader;
- while (l != l->leader)
- l = l->leader;
- return leader = l;
- }
+ : Variable(var), Expression(expr), dl(std::move(L)), locInts(alloc) {}
- /// Return the next UserValue in the equivalence class.
- UserValue *getNext() const { return next; }
-
- /// Does this UserValue match the parameters?
- bool match(const DILocalVariable *Var, const DIExpression *Expr,
- const DILocation *IA) const {
- // FIXME: The fragment should be part of the equivalence class, but not
- // other things in the expression like stack values.
- return Var == Variable && Expr == Expression && dl->getInlinedAt() == IA;
- }
-
- /// Merge equivalence classes.
- static UserValue *merge(UserValue *L1, UserValue *L2) {
- L2 = L2->getLeader();
- if (!L1)
- return L2;
- L1 = L1->getLeader();
- if (L1 == L2)
- return L1;
- // Splice L2 before L1's members.
- UserValue *End = L2;
- while (End->next) {
- End->leader = L1;
- End = End->next;
- }
- End->leader = L1;
- End->next = L1->next;
- L1->next = L2;
- return L1;
+ UserValueIdentity getId() {
+ return UserValueIdentity(Variable, Expression, dl->getInlinedAt());
}
/// Return the location number that matches Loc.
@@ -261,8 +254,8 @@ public:
void mapVirtRegs(LDVImpl *LDV);
/// Add a definition point to this value.
- void addDef(SlotIndex Idx, const MachineOperand &LocMO, bool IsIndirect) {
- DbgValueLocation Loc(getLocationNo(LocMO), IsIndirect);
+ void addDef(SlotIndex Idx, const MachineOperand &LocMO) {
+ DbgValueLocation Loc(getLocationNo(LocMO));
// Add a singular (Idx,Idx) -> Loc mapping.
LocMap::iterator I = locInts.find(Idx);
if (!I.valid() || I.start() != Idx)
@@ -297,11 +290,10 @@ public:
///
/// \param LI Scan for copies of the value in LI->reg.
/// \param LocNo Location number of LI->reg.
- /// \param WasIndirect Indicates if the original use of LI->reg was indirect
/// \param Kills Points where the range of LocNo could be extended.
/// \param [in,out] NewDefs Append (Idx, LocNo) of inserted defs here.
void addDefsFromCopies(
- LiveInterval *LI, unsigned LocNo, bool WasIndirect,
+ LiveInterval *LI, unsigned LocNo,
const SmallVectorImpl<SlotIndex> &Kills,
SmallVectorImpl<std::pair<SlotIndex, DbgValueLocation>> &NewDefs,
MachineRegisterInfo &MRI, LiveIntervals &LIS);
@@ -335,7 +327,29 @@ public:
void print(raw_ostream &, const TargetRegisterInfo *);
};
+} // namespace
+namespace llvm {
+template <> struct DenseMapInfo<UserValueIdentity> {
+ static UserValueIdentity getEmptyKey() {
+ auto Key = DenseMapInfo<DILocalVariable *>::getEmptyKey();
+ return UserValueIdentity(Key, nullptr, nullptr);
+ }
+ static UserValueIdentity getTombstoneKey() {
+ auto Key = DenseMapInfo<DILocalVariable *>::getTombstoneKey();
+ return UserValueIdentity(Key, nullptr, nullptr);
+ }
+ static unsigned getHashValue(const UserValueIdentity &Val) {
+ return Val.hash_value();
+ }
+ static bool isEqual(const UserValueIdentity &LHS,
+ const UserValueIdentity &RHS) {
+ return LHS.match(RHS);
+ }
+};
+} // namespace llvm
+
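
For readers unfamiliar with DenseMapInfo: it is the LLVM analog of supplying a hasher and an equality predicate to std::unordered_map, plus two reserved sentinel key values (empty and tombstone) that must never collide with a real key, hence the DILocalVariable sentinels above. A rough standard-library rendering of the same identity key (a sketch, not the LLVM code):

    #include <cstddef>
    #include <functional>
    #include <unordered_map>

    struct MiniIdentity {
      const void *Variable;
      const void *Expression;
      const void *InlinedAt;
      bool operator==(const MiniIdentity &O) const {
        return Variable == O.Variable && Expression == O.Expression &&
               InlinedAt == O.InlinedAt;
      }
    };

    struct MiniIdentityHash {
      size_t operator()(const MiniIdentity &V) const {
        // Same idea as hash_combine over the three pointers.
        size_t H = std::hash<const void *>()(V.Variable);
        H = H * 31 + std::hash<const void *>()(V.Expression);
        H = H * 31 + std::hash<const void *>()(V.InlinedAt);
        return H;
      }
    };

    // std::unordered_map needs no sentinel keys; DenseMap additionally
    // requires getEmptyKey()/getTombstoneKey().
    using MiniUVMap = std::unordered_map<MiniIdentity, int, MiniIdentityHash>;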
+namespace {
/// A user label is a part of a debug info user label.
class UserLabel {
const DILabel *Label; ///< The debug info label we are part of.
@@ -387,20 +401,20 @@ class LDVImpl {
/// All allocated UserLabel instances.
SmallVector<std::unique_ptr<UserLabel>, 2> userLabels;
- /// Map virtual register to eq class leader.
- using VRMap = DenseMap<unsigned, UserValue *>;
- VRMap virtRegToEqClass;
+ /// Map virtual register to UserValues which use it.
+ using VRMap = DenseMap<unsigned, SmallVector<UserValue *, 4>>;
+ VRMap VirtRegToUserVals;
- /// Map user variable to eq class leader.
- using UVMap = DenseMap<const DILocalVariable *, UserValue *>;
- UVMap userVarMap;
+ /// Map unique UserValue identity to UserValue.
+ using UVMap = DenseMap<UserValueIdentity, UserValue *>;
+ UVMap UserVarMap;
/// Find or create a UserValue.
UserValue *getUserValue(const DILocalVariable *Var, const DIExpression *Expr,
const DebugLoc &DL);
- /// Find the EC leader for VirtReg or null.
- UserValue *lookupVirtReg(unsigned VirtReg);
+ /// Find the UserValues for VirtReg or null.
+ SmallVectorImpl<UserValue *> *lookupVirtReg(unsigned VirtReg);
/// Add DBG_VALUE instruction to our maps.
///
@@ -440,8 +454,8 @@ public:
MF = nullptr;
userValues.clear();
userLabels.clear();
- virtRegToEqClass.clear();
- userVarMap.clear();
+ VirtRegToUserVals.clear();
+ UserVarMap.clear();
// Make sure we call emitDebugValues if the machine function was modified.
assert((!ModifiedMF || EmitDone) &&
"Dbg values are not emitted in LDV");
@@ -449,8 +463,8 @@ public:
ModifiedMF = false;
}
- /// Map virtual register to an equivalence class.
- void mapVirtReg(unsigned VirtReg, UserValue *EC);
+ /// Map virtual register to a UserValue.
+ void mapVirtReg(unsigned VirtReg, UserValue *UV);
/// Replace all references to OldReg with NewRegs.
void splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs);
@@ -521,8 +535,6 @@ void UserValue::print(raw_ostream &OS, const TargetRegisterInfo *TRI) {
OS << "undef";
else {
OS << I.value().locNo();
- if (I.value().wasIndirect())
- OS << " ind";
}
}
for (unsigned i = 0, e = locations.size(); i != e; ++i) {
@@ -554,37 +566,33 @@ void LDVImpl::print(raw_ostream &OS) {
void UserValue::mapVirtRegs(LDVImpl *LDV) {
for (unsigned i = 0, e = locations.size(); i != e; ++i)
if (locations[i].isReg() &&
- TargetRegisterInfo::isVirtualRegister(locations[i].getReg()))
+ Register::isVirtualRegister(locations[i].getReg()))
LDV->mapVirtReg(locations[i].getReg(), this);
}
UserValue *LDVImpl::getUserValue(const DILocalVariable *Var,
const DIExpression *Expr, const DebugLoc &DL) {
- UserValue *&Leader = userVarMap[Var];
- if (Leader) {
- UserValue *UV = Leader->getLeader();
- Leader = UV;
- for (; UV; UV = UV->getNext())
- if (UV->match(Var, Expr, DL->getInlinedAt()))
- return UV;
- }
+ auto Ident = UserValueIdentity(Var, Expr, DL->getInlinedAt());
+ UserValue *&UVEntry = UserVarMap[Ident];
- userValues.push_back(
- llvm::make_unique<UserValue>(Var, Expr, DL, allocator));
- UserValue *UV = userValues.back().get();
- Leader = UserValue::merge(Leader, UV);
- return UV;
+ if (UVEntry)
+ return UVEntry;
+
+ userValues.push_back(std::make_unique<UserValue>(Var, Expr, DL, allocator));
+ return UVEntry = userValues.back().get();
}
-void LDVImpl::mapVirtReg(unsigned VirtReg, UserValue *EC) {
- assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && "Only map VirtRegs");
- UserValue *&Leader = virtRegToEqClass[VirtReg];
- Leader = UserValue::merge(Leader, EC);
+void LDVImpl::mapVirtReg(unsigned VirtReg, UserValue *UV) {
+ assert(Register::isVirtualRegister(VirtReg) && "Only map VirtRegs");
+ assert(UserVarMap.find(UV->getId()) != UserVarMap.end() &&
+ "UserValue should exist in UserVarMap");
+ VirtRegToUserVals[VirtReg].push_back(UV);
}
-UserValue *LDVImpl::lookupVirtReg(unsigned VirtReg) {
- if (UserValue *UV = virtRegToEqClass.lookup(VirtReg))
- return UV->getLeader();
+SmallVectorImpl<UserValue *> *LDVImpl::lookupVirtReg(unsigned VirtReg) {
+ VRMap::iterator Itr = VirtRegToUserVals.find(VirtReg);
+ if (Itr != VirtRegToUserVals.end())
+ return &Itr->getSecond();
return nullptr;
}
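
With the equivalence-class machinery gone, one virtual register simply maps to several UserValues. Shape-wise, with hypothetical simplified types in place of the LLVM ones:

    #include <unordered_map>
    #include <vector>

    struct MiniUserValue { /* variable, expression, intervals ... */ };

    using MiniVRMap =
        std::unordered_map<unsigned, std::vector<MiniUserValue *>>;

    // Registration appends; lookup returns all users of the vreg, or null.
    void mapVirtReg(MiniVRMap &M, unsigned VReg, MiniUserValue *UV) {
      M[VReg].push_back(UV);
    }

    const std::vector<MiniUserValue *> *lookupVirtReg(const MiniVRMap &M,
                                                      unsigned VReg) {
      auto It = M.find(VReg);
      return It == M.end() ? nullptr : &It->second;
    }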
@@ -606,8 +614,8 @@ bool LDVImpl::handleDebugValue(MachineInstr &MI, SlotIndex Idx) {
// could be removed or replaced by asserts.
bool Discard = false;
if (MI.getOperand(0).isReg() &&
- TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg())) {
- const unsigned Reg = MI.getOperand(0).getReg();
+ Register::isVirtualRegister(MI.getOperand(0).getReg())) {
+ const Register Reg = MI.getOperand(0).getReg();
if (!LIS->hasInterval(Reg)) {
// The DBG_VALUE is described by a virtual register that does not have a
// live interval. Discard the DBG_VALUE.
@@ -631,19 +639,18 @@ bool LDVImpl::handleDebugValue(MachineInstr &MI, SlotIndex Idx) {
}
// Get or create the UserValue for (variable,offset) here.
- bool IsIndirect = MI.getOperand(1).isImm();
- if (IsIndirect)
- assert(MI.getOperand(1).getImm() == 0 && "DBG_VALUE with nonzero offset");
+ assert(!MI.getOperand(1).isImm() && "DBG_VALUE with indirect flag before "
+ "LiveDebugVariables");
const DILocalVariable *Var = MI.getDebugVariable();
const DIExpression *Expr = MI.getDebugExpression();
UserValue *UV =
getUserValue(Var, Expr, MI.getDebugLoc());
if (!Discard)
- UV->addDef(Idx, MI.getOperand(0), IsIndirect);
+ UV->addDef(Idx, MI.getOperand(0));
else {
MachineOperand MO = MachineOperand::CreateReg(0U, false);
MO.setIsDebug();
- UV->addDef(Idx, MO, false);
+ UV->addDef(Idx, MO);
}
return true;
}
@@ -666,7 +673,7 @@ bool LDVImpl::handleDebugLabel(MachineInstr &MI, SlotIndex Idx) {
}
}
if (!Found)
- userLabels.push_back(llvm::make_unique<UserLabel>(Label, DL, Idx));
+ userLabels.push_back(std::make_unique<UserLabel>(Label, DL, Idx));
return true;
}
@@ -751,14 +758,14 @@ void UserValue::extendDef(SlotIndex Idx, DbgValueLocation Loc, LiveRange *LR,
}
void UserValue::addDefsFromCopies(
- LiveInterval *LI, unsigned LocNo, bool WasIndirect,
+ LiveInterval *LI, unsigned LocNo,
const SmallVectorImpl<SlotIndex> &Kills,
SmallVectorImpl<std::pair<SlotIndex, DbgValueLocation>> &NewDefs,
MachineRegisterInfo &MRI, LiveIntervals &LIS) {
if (Kills.empty())
return;
// Don't track copies from physregs, there are too many uses.
- if (!TargetRegisterInfo::isVirtualRegister(LI->reg))
+ if (!Register::isVirtualRegister(LI->reg))
return;
// Collect all the (vreg, valno) pairs that are copies of LI.
@@ -768,13 +775,13 @@ void UserValue::addDefsFromCopies(
// Copies of the full value.
if (MO.getSubReg() || !MI->isCopy())
continue;
- unsigned DstReg = MI->getOperand(0).getReg();
+ Register DstReg = MI->getOperand(0).getReg();
// Don't follow copies to physregs. These are usually setting up call
// arguments, and the argument registers are always call clobbered. We are
// better off in the source register which could be a callee-saved register,
// or it could be spilled.
- if (!TargetRegisterInfo::isVirtualRegister(DstReg))
+ if (!Register::isVirtualRegister(DstReg))
continue;
// Is LocNo extended to reach this copy? If not, another def may be blocking
@@ -815,7 +822,7 @@ void UserValue::addDefsFromCopies(
MachineInstr *CopyMI = LIS.getInstructionFromIndex(DstVNI->def);
assert(CopyMI && CopyMI->isCopy() && "Bad copy value");
unsigned LocNo = getLocationNo(CopyMI->getOperand(0));
- DbgValueLocation NewLoc(LocNo, WasIndirect);
+ DbgValueLocation NewLoc(LocNo);
I.insert(Idx, Idx.getNextSlot(), NewLoc);
NewDefs.push_back(std::make_pair(Idx, NewLoc));
break;
@@ -845,7 +852,7 @@ void UserValue::computeIntervals(MachineRegisterInfo &MRI,
}
// Register locations are constrained to where the register value is live.
- if (TargetRegisterInfo::isVirtualRegister(LocMO.getReg())) {
+ if (Register::isVirtualRegister(LocMO.getReg())) {
LiveInterval *LI = nullptr;
const VNInfo *VNI = nullptr;
if (LIS.hasInterval(LocMO.getReg())) {
@@ -863,8 +870,7 @@ void UserValue::computeIntervals(MachineRegisterInfo &MRI,
// sub-register in that regclass). For now, simply skip handling copies if
// a sub-register is involved.
if (LI && !LocMO.getSubReg())
- addDefsFromCopies(LI, Loc.locNo(), Loc.wasIndirect(), Kills, Defs, MRI,
- LIS);
+ addDefsFromCopies(LI, Loc.locNo(), Kills, Defs, MRI, LIS);
continue;
}
@@ -1123,16 +1129,18 @@ UserValue::splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs,
void LDVImpl::splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs) {
bool DidChange = false;
- for (UserValue *UV = lookupVirtReg(OldReg); UV; UV = UV->getNext())
- DidChange |= UV->splitRegister(OldReg, NewRegs, *LIS);
+ if (auto *UserVals = lookupVirtReg(OldReg))
+ for (auto *UV : *UserVals)
+ DidChange |= UV->splitRegister(OldReg, NewRegs, *LIS);
if (!DidChange)
return;
// Map all of the new virtual registers.
- UserValue *UV = lookupVirtReg(OldReg);
- for (unsigned i = 0; i != NewRegs.size(); ++i)
- mapVirtReg(NewRegs[i], UV);
+ if (auto *UserVals = lookupVirtReg(OldReg))
+ for (auto *UV : *UserVals)
+ for (unsigned i = 0; i != NewRegs.size(); ++i)
+ mapVirtReg(NewRegs[i], UV);
}
void LiveDebugVariables::
@@ -1161,10 +1169,10 @@ void UserValue::rewriteLocations(VirtRegMap &VRM, const MachineFunction &MF,
MachineOperand Loc = locations[I];
// Only virtual registers are rewritten.
if (Loc.isReg() && Loc.getReg() &&
- TargetRegisterInfo::isVirtualRegister(Loc.getReg())) {
- unsigned VirtReg = Loc.getReg();
+ Register::isVirtualRegister(Loc.getReg())) {
+ Register VirtReg = Loc.getReg();
if (VRM.isAssignedReg(VirtReg) &&
- TargetRegisterInfo::isPhysicalRegister(VRM.getPhys(VirtReg))) {
+ Register::isPhysicalRegister(VRM.getPhys(VirtReg))) {
// This can create a %noreg operand in rare cases when the sub-register
// index is no longer available. That means the user value is in a
// non-existent sub-register, and %noreg is exactly what we want.
@@ -1258,7 +1266,7 @@ findNextInsertLocation(MachineBasicBlock *MBB,
const TargetRegisterInfo &TRI) {
if (!LocMO.isReg())
return MBB->instr_end();
- unsigned Reg = LocMO.getReg();
+ Register Reg = LocMO.getReg();
// Find the next instruction in the MBB that defines the register Reg.
while (I != MBB->end() && !I->isTerminator()) {
@@ -1302,21 +1310,14 @@ void UserValue::insertDebugValue(MachineBasicBlock *MBB, SlotIndex StartIdx,
// that the original virtual register was a pointer. Also, add the stack slot
// offset for the spilled register to the expression.
const DIExpression *Expr = Expression;
- uint8_t DIExprFlags = DIExpression::ApplyOffset;
- bool IsIndirect = Loc.wasIndirect();
- if (Spilled) {
- if (IsIndirect)
- DIExprFlags |= DIExpression::DerefAfter;
- Expr =
- DIExpression::prepend(Expr, DIExprFlags, SpillOffset);
- IsIndirect = true;
- }
+ if (Spilled)
+ Expr = DIExpression::prepend(Expr, DIExpression::ApplyOffset, SpillOffset);
assert((!Spilled || MO.isFI()) && "a spilled location must be a frame index");
do {
BuildMI(*MBB, I, getDebugLoc(), TII.get(TargetOpcode::DBG_VALUE),
- IsIndirect, MO, Variable, Expr);
+ Spilled, MO, Variable, Expr);
// Continue and insert DBG_VALUES after every redefinition of register
// associated with the debug value within the range
diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp
index 70b2a77fe800..54ac46f2e7ce 100644
--- a/lib/CodeGen/LiveInterval.cpp
+++ b/lib/CodeGen/LiveInterval.cpp
@@ -886,7 +886,7 @@ static void stripValuesNotDefiningMask(unsigned Reg, LiveInterval::SubRange &SR,
const TargetRegisterInfo &TRI) {
// Phys reg should not be tracked at subreg level.
// Same for noreg (Reg == 0).
- if (!TargetRegisterInfo::isVirtualRegister(Reg) || !Reg)
+ if (!Register::isVirtualRegister(Reg) || !Reg)
return;
// Remove the values that don't define those lanes.
SmallVector<VNInfo *, 8> ToBeRemoved;
@@ -917,7 +917,8 @@ static void stripValuesNotDefiningMask(unsigned Reg, LiveInterval::SubRange &SR,
for (VNInfo *VNI : ToBeRemoved)
SR.removeValNo(VNI);
- assert(!SR.empty() && "At least one value should be defined by this mask");
+ // If the subrange is empty at this point, the MIR is invalid. Do not assert
+ // and let the verifier catch this case.
}
void LiveInterval::refineSubRanges(
@@ -967,7 +968,7 @@ void LiveInterval::computeSubRangeUndefs(SmallVectorImpl<SlotIndex> &Undefs,
LaneBitmask LaneMask,
const MachineRegisterInfo &MRI,
const SlotIndexes &Indexes) const {
- assert(TargetRegisterInfo::isVirtualRegister(reg));
+ assert(Register::isVirtualRegister(reg));
LaneBitmask VRegMask = MRI.getMaxLaneMaskForVReg(reg);
assert((VRegMask & LaneMask).any());
const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
diff --git a/lib/CodeGen/LiveIntervals.cpp b/lib/CodeGen/LiveIntervals.cpp
index aa85569063b3..2989930ad093 100644
--- a/lib/CodeGen/LiveIntervals.cpp
+++ b/lib/CodeGen/LiveIntervals.cpp
@@ -14,7 +14,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/LiveIntervals.h"
-#include "LiveRangeCalc.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -22,6 +21,7 @@
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveRangeCalc.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
@@ -108,7 +108,7 @@ LiveIntervals::~LiveIntervals() {
void LiveIntervals::releaseMemory() {
// Free the live intervals themselves.
for (unsigned i = 0, e = VirtRegIntervals.size(); i != e; ++i)
- delete VirtRegIntervals[TargetRegisterInfo::index2VirtReg(i)];
+ delete VirtRegIntervals[Register::index2VirtReg(i)];
VirtRegIntervals.clear();
RegMaskSlots.clear();
RegMaskBits.clear();
@@ -161,7 +161,7 @@ void LiveIntervals::print(raw_ostream &OS, const Module* ) const {
// Dump the virtregs.
for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ unsigned Reg = Register::index2VirtReg(i);
if (hasInterval(Reg))
OS << getInterval(Reg) << '\n';
}
@@ -186,7 +186,7 @@ LLVM_DUMP_METHOD void LiveIntervals::dumpInstrs() const {
#endif
LiveInterval* LiveIntervals::createInterval(unsigned reg) {
- float Weight = TargetRegisterInfo::isPhysicalRegister(reg) ? huge_valf : 0.0F;
+ float Weight = Register::isPhysicalRegister(reg) ? huge_valf : 0.0F;
return new LiveInterval(reg, Weight);
}
@@ -201,7 +201,7 @@ void LiveIntervals::computeVirtRegInterval(LiveInterval &LI) {
void LiveIntervals::computeVirtRegs() {
for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ unsigned Reg = Register::index2VirtReg(i);
if (MRI->reg_nodbg_empty(Reg))
continue;
createAndComputeVirtRegInterval(Reg);
@@ -441,8 +441,8 @@ void LiveIntervals::extendSegmentsToUses(LiveRange &Segments,
bool LiveIntervals::shrinkToUses(LiveInterval *li,
SmallVectorImpl<MachineInstr*> *dead) {
LLVM_DEBUG(dbgs() << "Shrink: " << *li << '\n');
- assert(TargetRegisterInfo::isVirtualRegister(li->reg)
- && "Can only shrink virtual registers");
+ assert(Register::isVirtualRegister(li->reg) &&
+ "Can only shrink virtual registers");
// Shrink subregister live ranges.
bool NeedsCleanup = false;
@@ -541,8 +541,8 @@ bool LiveIntervals::computeDeadValues(LiveInterval &LI,
void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg) {
LLVM_DEBUG(dbgs() << "Shrink: " << SR << '\n');
- assert(TargetRegisterInfo::isVirtualRegister(Reg)
- && "Can only shrink virtual registers");
+ assert(Register::isVirtualRegister(Reg) &&
+ "Can only shrink virtual registers");
// Find all the values used, including PHI kills.
ShrinkToUsesWorkList WorkList;
@@ -688,7 +688,7 @@ void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
LiveRange::const_iterator>, 4> SRs;
for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ unsigned Reg = Register::index2VirtReg(i);
if (MRI->reg_nodbg_empty(Reg))
continue;
const LiveInterval &LI = getInterval(Reg);
@@ -986,10 +986,10 @@ public:
MO.setIsKill(false);
}
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
LiveInterval &LI = LIS.getInterval(Reg);
if (LI.hasSubRanges()) {
unsigned SubReg = MO.getSubReg();
@@ -1023,7 +1023,7 @@ private:
return;
LLVM_DEBUG({
dbgs() << " ";
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
dbgs() << printReg(Reg);
if (LaneMask.any())
dbgs() << " L" << PrintLaneMask(LaneMask);
@@ -1288,6 +1288,20 @@ private:
const SlotIndex SplitPos = NewIdxDef;
OldIdxVNI = OldIdxIn->valno;
+ SlotIndex NewDefEndPoint = std::next(NewIdxIn)->end;
+ if (OldIdxIn != LR.begin() &&
+ SlotIndex::isEarlierInstr(NewIdx, std::prev(OldIdxIn)->end)) {
+ // If the segment before OldIdx read a value defined earlier than
+ // NewIdx, the moved instruction also reads and forwards that
+ // value. Extend the lifetime of the new def point.
+
+ // Extend to where the previous range started, unless there is
+ // another redef first.
+ NewDefEndPoint = std::min(OldIdxIn->start,
+ std::next(NewIdxOut)->start);
+ }
+
// Merge the OldIdxIn and OldIdxOut segments into OldIdxOut.
OldIdxOut->valno->def = OldIdxIn->start;
*OldIdxOut = LiveRange::Segment(OldIdxIn->start, OldIdxOut->end,
@@ -1305,7 +1319,8 @@ private:
// There is no gap between NewSegment and its predecessor.
*NewSegment = LiveRange::Segment(Next->start, SplitPos,
Next->valno);
- *Next = LiveRange::Segment(SplitPos, Next->end, OldIdxVNI);
+
+ *Next = LiveRange::Segment(SplitPos, NewDefEndPoint, OldIdxVNI);
Next->valno->def = SplitPos;
} else {
// There is a gap between NewSegment and its predecessor
@@ -1384,7 +1399,7 @@ private:
// Return the last use of reg between NewIdx and OldIdx.
SlotIndex findLastUseBefore(SlotIndex Before, unsigned Reg,
LaneBitmask LaneMask) {
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
SlotIndex LastUse = Before;
for (MachineOperand &MO : MRI.use_nodbg_operands(Reg)) {
if (MO.isUndef())
@@ -1429,7 +1444,7 @@ private:
// Check if MII uses Reg.
for (MIBundleOperands MO(*MII); MO.isValid(); ++MO)
if (MO->isReg() && !MO->isUndef() &&
- TargetRegisterInfo::isPhysicalRegister(MO->getReg()) &&
+ Register::isPhysicalRegister(MO->getReg()) &&
TRI.hasRegUnit(MO->getReg(), Reg))
return Idx.getRegSlot();
}
@@ -1439,7 +1454,10 @@ private:
};
void LiveIntervals::handleMove(MachineInstr &MI, bool UpdateFlags) {
- assert(!MI.isBundled() && "Can't handle bundled instructions yet.");
+ // It is fine to move a bundle as a whole, but not an individual instruction
+ // inside it.
+ assert((!MI.isBundled() || MI.getOpcode() == TargetOpcode::BUNDLE) &&
+ "Cannot move instruction in bundle");
SlotIndex OldIndex = Indexes->getInstructionIndex(MI);
Indexes->removeMachineInstrFromMaps(MI);
SlotIndex NewIndex = Indexes->insertMachineInstrInMaps(MI);
@@ -1582,8 +1600,7 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
for (MachineInstr::const_mop_iterator MOI = MI.operands_begin(),
MOE = MI.operands_end();
MOI != MOE; ++MOI) {
- if (MOI->isReg() &&
- TargetRegisterInfo::isVirtualRegister(MOI->getReg()) &&
+ if (MOI->isReg() && Register::isVirtualRegister(MOI->getReg()) &&
!hasInterval(MOI->getReg())) {
createAndComputeVirtRegInterval(MOI->getReg());
}
@@ -1591,7 +1608,7 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
}
for (unsigned Reg : OrigRegs) {
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
continue;
LiveInterval &LI = getInterval(Reg);
@@ -1642,7 +1659,7 @@ void LiveIntervals::splitSeparateComponents(LiveInterval &LI,
unsigned Reg = LI.reg;
const TargetRegisterClass *RegClass = MRI->getRegClass(Reg);
for (unsigned I = 1; I < NumComp; ++I) {
- unsigned NewVReg = MRI->createVirtualRegister(RegClass);
+ Register NewVReg = MRI->createVirtualRegister(RegClass);
LiveInterval &NewLI = createEmptyInterval(NewVReg);
SplitLIs.push_back(&NewLI);
}
diff --git a/lib/CodeGen/LivePhysRegs.cpp b/lib/CodeGen/LivePhysRegs.cpp
index cd3d248ac878..c2a1cc7c6490 100644
--- a/lib/CodeGen/LivePhysRegs.cpp
+++ b/lib/CodeGen/LivePhysRegs.cpp
@@ -46,8 +46,8 @@ void LivePhysRegs::removeDefs(const MachineInstr &MI) {
if (O->isReg()) {
if (!O->isDef() || O->isDebug())
continue;
- unsigned Reg = O->getReg();
- if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ Register Reg = O->getReg();
+ if (!Register::isPhysicalRegister(Reg))
continue;
removeReg(Reg);
} else if (O->isRegMask())
@@ -60,8 +60,8 @@ void LivePhysRegs::addUses(const MachineInstr &MI) {
for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
if (!O->isReg() || !O->readsReg() || O->isDebug())
continue;
- unsigned Reg = O->getReg();
- if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ Register Reg = O->getReg();
+ if (!Register::isPhysicalRegister(Reg))
continue;
addReg(Reg);
}
@@ -86,8 +86,8 @@ void LivePhysRegs::stepForward(const MachineInstr &MI,
// Remove killed registers from the set.
for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
if (O->isReg() && !O->isDebug()) {
- unsigned Reg = O->getReg();
- if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ Register Reg = O->getReg();
+ if (!Register::isPhysicalRegister(Reg))
continue;
if (O->isDef()) {
// Note, dead defs are still recorded. The caller should decide how to
@@ -292,10 +292,10 @@ void llvm::recomputeLivenessFlags(MachineBasicBlock &MBB) {
if (!MO->isReg() || !MO->isDef() || MO->isDebug())
continue;
- unsigned Reg = MO->getReg();
+ Register Reg = MO->getReg();
if (Reg == 0)
continue;
- assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+ assert(Register::isPhysicalRegister(Reg));
bool IsNotLive = LiveRegs.available(MRI, Reg);
MO->setIsDead(IsNotLive);
@@ -309,10 +309,10 @@ void llvm::recomputeLivenessFlags(MachineBasicBlock &MBB) {
if (!MO->isReg() || !MO->readsReg() || MO->isDebug())
continue;
- unsigned Reg = MO->getReg();
+ Register Reg = MO->getReg();
if (Reg == 0)
continue;
- assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+ assert(Register::isPhysicalRegister(Reg));
bool IsNotLive = LiveRegs.available(MRI, Reg);
MO->setIsKill(IsNotLive);
diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp
index d670f28df6ba..24b57be0da00 100644
--- a/lib/CodeGen/LiveRangeCalc.cpp
+++ b/lib/CodeGen/LiveRangeCalc.cpp
@@ -10,7 +10,7 @@
//
//===----------------------------------------------------------------------===//
-#include "LiveRangeCalc.h"
+#include "llvm/CodeGen/LiveRangeCalc.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
@@ -372,8 +372,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB,
report_fatal_error("Use not jointly dominated by defs.");
}
- if (TargetRegisterInfo::isPhysicalRegister(PhysReg) &&
- !MBB->isLiveIn(PhysReg)) {
+ if (Register::isPhysicalRegister(PhysReg) && !MBB->isLiveIn(PhysReg)) {
MBB->getParent()->verify();
const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
errs() << "The register " << printReg(PhysReg, TRI)
diff --git a/lib/CodeGen/LiveRangeCalc.h b/lib/CodeGen/LiveRangeCalc.h
deleted file mode 100644
index 11aea5a3b016..000000000000
--- a/lib/CodeGen/LiveRangeCalc.h
+++ /dev/null
@@ -1,297 +0,0 @@
-//===- LiveRangeCalc.h - Calculate live ranges ------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The LiveRangeCalc class can be used to compute live ranges from scratch. It
-// caches information about values in the CFG to speed up repeated operations
-// on the same live range. The cache can be shared by non-overlapping live
-// ranges. SplitKit uses that when computing the live range of split products.
-//
-// A low-level interface is available to clients that know where a variable is
-// live, but don't know which value it has at every point. LiveRangeCalc will
-// propagate values down the dominator tree, and even insert PHI-defs where
-// needed. SplitKit uses this faster interface when possible.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_CODEGEN_LIVERANGECALC_H
-#define LLVM_LIB_CODEGEN_LIVERANGECALC_H
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/IndexedMap.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/LiveInterval.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/SlotIndexes.h"
-#include "llvm/MC/LaneBitmask.h"
-#include <utility>
-
-namespace llvm {
-
-template <class NodeT> class DomTreeNodeBase;
-class MachineDominatorTree;
-class MachineFunction;
-class MachineRegisterInfo;
-
-using MachineDomTreeNode = DomTreeNodeBase<MachineBasicBlock>;
-
-class LiveRangeCalc {
- const MachineFunction *MF = nullptr;
- const MachineRegisterInfo *MRI = nullptr;
- SlotIndexes *Indexes = nullptr;
- MachineDominatorTree *DomTree = nullptr;
- VNInfo::Allocator *Alloc = nullptr;
-
- /// LiveOutPair - A value and the block that defined it. The domtree node is
- /// redundant, it can be computed as: MDT[Indexes.getMBBFromIndex(VNI->def)].
- using LiveOutPair = std::pair<VNInfo *, MachineDomTreeNode *>;
-
- /// LiveOutMap - Map basic blocks to the value leaving the block.
- using LiveOutMap = IndexedMap<LiveOutPair, MBB2NumberFunctor>;
-
- /// Bit vector of active entries in LiveOut, also used as a visited set by
- /// findReachingDefs. One entry per basic block, indexed by block number.
- /// This is kept as a separate bit vector because it can be cleared quickly
- /// when switching live ranges.
- BitVector Seen;
-
- /// Map each LiveRange to a pair of bit vectors over blocks: the blocks in
- /// which the range is defined on entry, and those in which it is undefined
- /// on entry. A block is defined on entry if there is a path from at least
- /// one of the defs in the live range to the entry of the block, and
- /// conversely, a block is undefined on entry if no such path exists (i.e. no
- /// definition reaches the entry of the block). A single LiveRangeCalc
- /// object is used to track live-out information for multiple registers
- /// in live range splitting (which is ok, since the live ranges of these
- /// registers do not overlap), but the defined/undefined information must
- /// be kept separate for each individual range.
- /// By convention, EntryInfoMap[&LR] = { Defined, Undefined }.
- using EntryInfoMap = DenseMap<LiveRange *, std::pair<BitVector, BitVector>>;
- EntryInfoMap EntryInfos;
-
- /// Map each basic block where a live range is live out to the live-out value
- /// and its defining block.
- ///
- /// For every basic block, MBB, one of these conditions shall be true:
- ///
- /// 1. !Seen.count(MBB->getNumber())
- /// Blocks without a Seen bit are ignored.
- /// 2. LiveOut[MBB].second.getNode() == MBB
- /// The live-out value is defined in MBB.
- /// 3. forall P in preds(MBB): LiveOut[P] == LiveOut[MBB]
- /// The live-out value passes through MBB. All predecessors must carry
- /// the same value.
- ///
- /// The domtree node may be null; it can be computed when needed.
- ///
- /// The map can be shared by multiple live ranges as long as no two are
- /// live-out of the same block.
- LiveOutMap Map;
-
- /// LiveInBlock - Information about a basic block where a live range is known
- /// to be live-in, but the value has not yet been determined.
- struct LiveInBlock {
- // The live range set that is live-in to this block. The algorithms can
- // handle multiple non-overlapping live ranges simultaneously.
- LiveRange &LR;
-
- // DomNode - Dominator tree node for the block.
- // Cleared when the final value has been determined and LI has been updated.
- MachineDomTreeNode *DomNode;
-
- // Position in block where the live-in range ends, or SlotIndex() if the
- // range passes through the block. When the final value has been
- // determined, the range from the block start to Kill will be added to LI.
- SlotIndex Kill;
-
- // Live-in value filled in by updateSSA once it is known.
- VNInfo *Value = nullptr;
-
- LiveInBlock(LiveRange &LR, MachineDomTreeNode *node, SlotIndex kill)
- : LR(LR), DomNode(node), Kill(kill) {}
- };
-
- /// LiveIn - Work list of blocks where the live-in value has yet to be
- /// determined. This list is typically computed by findReachingDefs() and
- /// used as a work list by updateSSA(). The low-level interface may also be
- /// used to add entries directly.
- SmallVector<LiveInBlock, 16> LiveIn;
-
- /// Check if the entry to block @p MBB can be reached by any of the defs
- /// in @p LR. Return true if none of the defs reach the entry to @p MBB.
- bool isDefOnEntry(LiveRange &LR, ArrayRef<SlotIndex> Undefs,
- MachineBasicBlock &MBB, BitVector &DefOnEntry,
- BitVector &UndefOnEntry);
-
- /// Find the set of defs that can reach @p Kill. @p Kill must belong to
- /// @p UseMBB.
- ///
- /// If exactly one def can reach @p UseMBB, and the def dominates @p Kill,
- /// all paths from the def to @p UseMBB are added to @p LR, and the function
- /// returns true.
- ///
- /// If multiple values can reach @p UseMBB, the blocks that need @p LR to be
- /// live in are added to the LiveIn array, and the function returns false.
- ///
- /// The array @p Undef provides the locations where the range @p LR becomes
- /// undefined by <def,read-undef> operands on other subranges. If @p Undef
- /// is non-empty and @p Kill is jointly dominated only by the entries of
- /// @p Undef, the function returns false.
- ///
- /// PhysReg, when set, is used to verify live-in lists on basic blocks.
- bool findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB,
- SlotIndex Use, unsigned PhysReg,
- ArrayRef<SlotIndex> Undefs);
-
- /// updateSSA - Compute the values that will be live in to all requested
- /// blocks in LiveIn. Create PHI-def values as required to preserve SSA form.
- ///
- /// Every live-in block must be jointly dominated by the added live-out
- /// blocks. No values are read from the live ranges.
- void updateSSA();
-
- /// Transfer information from the LiveIn vector to the live ranges and update
- /// the given @p LiveOuts.
- void updateFromLiveIns();
-
- /// Extend the live range of @p LR to reach all uses of Reg.
- ///
- /// If @p LR is a main range, or if @p LI is null, then all uses must be
- /// jointly dominated by the definitions from @p LR. If @p LR is a subrange
- /// of the live interval @p LI, corresponding to lane mask @p LaneMask,
- /// all uses must be jointly dominated by the definitions from @p LR
- /// together with definitions of other lanes where @p LR becomes undefined
- /// (via <def,read-undef> operands).
- /// If @p LR is a main range, the @p LaneMask should be set to ~0, i.e.
- /// LaneBitmask::getAll().
- void extendToUses(LiveRange &LR, unsigned Reg, LaneBitmask LaneMask,
- LiveInterval *LI = nullptr);
-
- /// Reset Map and Seen fields.
- void resetLiveOutMap();
-
-public:
- LiveRangeCalc() = default;
-
- //===--------------------------------------------------------------------===//
- // High-level interface.
- //===--------------------------------------------------------------------===//
- //
- // Calculate live ranges from scratch.
- //
-
- /// reset - Prepare caches for a new set of non-overlapping live ranges. The
- /// caches must be reset before attempting calculations with a live range
- /// that may overlap a previously computed live range, and before the first
- /// live range in a function. If live ranges are not known to be
- /// non-overlapping, call reset before each.
- void reset(const MachineFunction *mf, SlotIndexes *SI,
- MachineDominatorTree *MDT, VNInfo::Allocator *VNIA);
-
- //===--------------------------------------------------------------------===//
- // Mid-level interface.
- //===--------------------------------------------------------------------===//
- //
- // Modify existing live ranges.
- //
-
- /// Extend the live range of @p LR to reach @p Use.
- ///
- /// The existing values in @p LR must be live so they jointly dominate @p Use.
- /// If @p Use is not dominated by a single existing value, PHI-defs are
- /// inserted as required to preserve SSA form.
- ///
- /// PhysReg, when set, is used to verify live-in lists on basic blocks.
- void extend(LiveRange &LR, SlotIndex Use, unsigned PhysReg,
- ArrayRef<SlotIndex> Undefs);
-
- /// createDeadDefs - Create a dead def in @p LR for every def operand of Reg.
- /// Each instruction defining Reg gets a new VNInfo with a corresponding
- /// minimal live range.
- void createDeadDefs(LiveRange &LR, unsigned Reg);
-
- /// Extend the live range of @p LR to reach all uses of Reg.
- ///
- /// All uses must be jointly dominated by existing liveness. PHI-defs are
- /// inserted as needed to preserve SSA form.
- void extendToUses(LiveRange &LR, unsigned PhysReg) {
- extendToUses(LR, PhysReg, LaneBitmask::getAll());
- }
-
- /// Calculates liveness for the register specified in live interval @p LI.
- /// Creates subregister live ranges as needed if subreg liveness tracking is
- /// enabled.
- void calculate(LiveInterval &LI, bool TrackSubRegs);
-
- /// For live interval \p LI with correct SubRanges construct matching
- /// information for the main live range. Expects the main live range to not
- /// have any segments or value numbers.
- void constructMainRangeFromSubranges(LiveInterval &LI);
-
- //===--------------------------------------------------------------------===//
- // Low-level interface.
- //===--------------------------------------------------------------------===//
- //
- // These functions can be used to compute live ranges where the live-in and
- // live-out blocks are already known, but the SSA value in each block is
- // unknown.
- //
- // After calling reset(), add known live-out values and known live-in blocks.
- // Then call calculateValues() to compute the actual value that is
- // live-in to each block, and add liveness to the live ranges.
- //
-
- /// setLiveOutValue - Indicate that VNI is live out from MBB. The
- /// calculateValues() function will not add liveness for MBB, the caller
- /// should take care of that.
- ///
- /// VNI may be null only if MBB is a live-through block also passed to
- /// addLiveInBlock().
- void setLiveOutValue(MachineBasicBlock *MBB, VNInfo *VNI) {
- Seen.set(MBB->getNumber());
- Map[MBB] = LiveOutPair(VNI, nullptr);
- }
-
- /// addLiveInBlock - Add a block with an unknown live-in value. This
- /// function can only be called once per basic block. Once the live-in value
- /// has been determined, calculateValues() will add liveness to LI.
- ///
- /// @param LR The live range that is live-in to the block.
- /// @param DomNode The domtree node for the block.
- /// @param Kill Index in block where LI is killed. If the value is
- /// live-through, set Kill = SlotIndex() and also call
- /// setLiveOutValue(MBB, 0).
- void addLiveInBlock(LiveRange &LR,
- MachineDomTreeNode *DomNode,
- SlotIndex Kill = SlotIndex()) {
- LiveIn.push_back(LiveInBlock(LR, DomNode, Kill));
- }
-
- /// calculateValues - Calculate the value that will be live-in to each block
- /// added with addLiveInBlock. Add PHI-def values as needed to preserve SSA
- /// form. Add liveness to all live-in blocks up to the Kill point, or the
- /// whole block for live-through blocks.
- ///
- /// Every predecessor of a live-in block must have been given a value with
- /// setLiveOutValue; the value may be null for live-through blocks.
- void calculateValues();
-
- /// A diagnostic function to check if the end of the block @p MBB is
- /// jointly dominated by the blocks corresponding to the slot indices
- /// in @p Defs. This function is mainly for use in self-verification
- /// checks.
- LLVM_ATTRIBUTE_UNUSED
- static bool isJointlyDominated(const MachineBasicBlock *MBB,
- ArrayRef<SlotIndex> Defs,
- const SlotIndexes &Indexes);
-};
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_CODEGEN_LIVERANGECALC_H
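The header's three interface tiers survive the move to llvm/CodeGen/LiveRangeCalc.h unchanged. As a reading aid, here is a minimal sketch of how a client might drive the low-level interface documented above; the MachineFunction, SlotIndexes, dominator tree, VNInfo allocator, and the LR/PredMBB/VNI/UseMBB/KillIdx values are all assumed to come from the enclosing pass:

  // Sketch only: all inputs are assumed to be provided by the enclosing pass.
  LiveRangeCalc LRC;
  LRC.reset(&MF, Indexes, DomTree, &VNAlloc);

  // Record the value leaving each predecessor block...
  LRC.setLiveOutValue(PredMBB, VNI);

  // ...and register the blocks that need a live-in value computed for LR.
  LRC.addLiveInBlock(LR, DomTree->getNode(UseMBB), KillIdx);

  // Insert PHI-defs where needed and fill in liveness up to each Kill point.
  LRC.calculateValues();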
diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp
index 882e562ba95c..34bac082bcd7 100644
--- a/lib/CodeGen/LiveRangeEdit.cpp
+++ b/lib/CodeGen/LiveRangeEdit.cpp
@@ -32,7 +32,7 @@ void LiveRangeEdit::Delegate::anchor() { }
LiveInterval &LiveRangeEdit::createEmptyIntervalFrom(unsigned OldReg,
bool createSubRanges) {
- unsigned VReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
+ Register VReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
if (VRM)
VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
@@ -52,7 +52,7 @@ LiveInterval &LiveRangeEdit::createEmptyIntervalFrom(unsigned OldReg,
}
unsigned LiveRangeEdit::createFrom(unsigned OldReg) {
- unsigned VReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
+ Register VReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
if (VRM) {
VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
}
@@ -114,7 +114,7 @@ bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI,
continue;
// We can't remat physreg uses, unless it is a constant.
- if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ if (Register::isPhysicalRegister(MO.getReg())) {
if (MRI.isConstantPhysReg(MO.getReg()))
continue;
return false;
@@ -232,7 +232,7 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
LLVM_DEBUG(dbgs() << " folded: " << *FoldMI);
LIS.ReplaceMachineInstrInMaps(*UseMI, *FoldMI);
if (UseMI->isCall())
- UseMI->getMF()->updateCallSiteInfo(UseMI, FoldMI);
+ UseMI->getMF()->moveCallSiteInfo(UseMI, FoldMI);
UseMI->eraseFromParent();
DefMI->addRegisterDead(LI->reg, nullptr);
Dead.push_back(DefMI);
@@ -308,8 +308,8 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink,
MOE = MI->operands_end(); MOI != MOE; ++MOI) {
if (!MOI->isReg())
continue;
- unsigned Reg = MOI->getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+ Register Reg = MOI->getReg();
+ if (!Register::isVirtualRegister(Reg)) {
// Check if MI reads any unreserved physregs.
if (Reg && MOI->readsReg() && !MRI.isReserved(Reg))
ReadsPhysRegs = true;
@@ -349,7 +349,7 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink,
// Remove all operands that aren't physregs.
for (unsigned i = MI->getNumOperands(); i; --i) {
const MachineOperand &MO = MI->getOperand(i-1);
- if (MO.isReg() && TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ if (MO.isReg() && Register::isPhysicalRegister(MO.getReg()))
continue;
MI->RemoveOperand(i-1);
}
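The pattern in this and the following files is mechanical: unsigned register variables become llvm::Register, and the static TargetRegisterInfo::isVirtualRegister / isPhysicalRegister predicates become their Register equivalents. A rough sketch of what the predicates distinguish; MO is assumed to be a register MachineOperand:

  Register Reg = MO.getReg();  // converts to/from unsigned implicitly
  if (Register::isVirtualRegister(Reg)) {
    // Virtual registers (%0, %1, ...) map to a dense index.
    unsigned Idx = Register::virtReg2Index(Reg);
    (void)Idx;
  } else if (Register::isPhysicalRegister(Reg)) {
    // Physical registers are small target-defined numbers; 0 means
    // "no register".
  }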
diff --git a/lib/CodeGen/LiveRangeShrink.cpp b/lib/CodeGen/LiveRangeShrink.cpp
index 8818f1ce0ad9..cbf112ee2bd5 100644
--- a/lib/CodeGen/LiveRangeShrink.cpp
+++ b/lib/CodeGen/LiveRangeShrink.cpp
@@ -172,10 +172,10 @@ bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg() || MO.isDead() || MO.isDebug())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
// Do not move the instruction if it def/uses a physical register,
// unless it is a constant physical register or a noreg.
- if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (!Register::isVirtualRegister(Reg)) {
if (!Reg || MRI.isConstantPhysReg(Reg))
continue;
Insert = nullptr;
diff --git a/lib/CodeGen/LiveRegMatrix.cpp b/lib/CodeGen/LiveRegMatrix.cpp
index ce99e5535c25..72c79e5f8a75 100644
--- a/lib/CodeGen/LiveRegMatrix.cpp
+++ b/lib/CodeGen/LiveRegMatrix.cpp
@@ -118,7 +118,7 @@ void LiveRegMatrix::assign(LiveInterval &VirtReg, unsigned PhysReg) {
}
void LiveRegMatrix::unassign(LiveInterval &VirtReg) {
- unsigned PhysReg = VRM->getPhys(VirtReg.reg);
+ Register PhysReg = VRM->getPhys(VirtReg.reg);
LLVM_DEBUG(dbgs() << "unassigning " << printReg(VirtReg.reg, TRI) << " from "
<< printReg(PhysReg, TRI) << ':');
VRM->clearVirt(VirtReg.reg);
diff --git a/lib/CodeGen/LiveRegUnits.cpp b/lib/CodeGen/LiveRegUnits.cpp
index 6afb7fb7aa11..97763def1f40 100644
--- a/lib/CodeGen/LiveRegUnits.cpp
+++ b/lib/CodeGen/LiveRegUnits.cpp
@@ -47,8 +47,8 @@ void LiveRegUnits::stepBackward(const MachineInstr &MI) {
if (O->isReg()) {
if (!O->isDef() || O->isDebug())
continue;
- unsigned Reg = O->getReg();
- if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ Register Reg = O->getReg();
+ if (!Register::isPhysicalRegister(Reg))
continue;
removeReg(Reg);
} else if (O->isRegMask())
@@ -59,8 +59,8 @@ void LiveRegUnits::stepBackward(const MachineInstr &MI) {
for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
if (!O->isReg() || !O->readsReg() || O->isDebug())
continue;
- unsigned Reg = O->getReg();
- if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ Register Reg = O->getReg();
+ if (!Register::isPhysicalRegister(Reg))
continue;
addReg(Reg);
}
@@ -70,8 +70,8 @@ void LiveRegUnits::accumulate(const MachineInstr &MI) {
// Add defs, uses and regmask clobbers to the set.
for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
if (O->isReg()) {
- unsigned Reg = O->getReg();
- if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ Register Reg = O->getReg();
+ if (!Register::isPhysicalRegister(Reg))
continue;
if (!O->isDef() && !O->readsReg())
continue;
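All three LiveRegUnits walks above (stepBackward, the read-operands loop, and accumulate) get the same Register treatment. For context, a typical backward liveness scan with this class looks roughly like the following sketch; SomePhysReg stands in for a concrete target register and MBB for the block being scanned:

  LiveRegUnits Units(*TRI);
  Units.addLiveOuts(MBB);            // seed with the registers live out of MBB
  for (MachineInstr &MI : llvm::reverse(MBB)) {
    Units.stepBackward(MI);          // clear defs, then add uses
    if (Units.available(SomePhysReg)) {
      // SomePhysReg is free immediately before MI.
    }
  }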
diff --git a/lib/CodeGen/LiveStacks.cpp b/lib/CodeGen/LiveStacks.cpp
index f55977d72723..8df84ebf4f06 100644
--- a/lib/CodeGen/LiveStacks.cpp
+++ b/lib/CodeGen/LiveStacks.cpp
@@ -58,9 +58,10 @@ LiveStacks::getOrCreateInterval(int Slot, const TargetRegisterClass *RC) {
assert(Slot >= 0 && "Spill slot index must be >= 0");
SS2IntervalMap::iterator I = S2IMap.find(Slot);
if (I == S2IMap.end()) {
- I = S2IMap.emplace(std::piecewise_construct, std::forward_as_tuple(Slot),
- std::forward_as_tuple(
- TargetRegisterInfo::index2StackSlot(Slot), 0.0F))
+ I = S2IMap
+ .emplace(
+ std::piecewise_construct, std::forward_as_tuple(Slot),
+ std::forward_as_tuple(Register::index2StackSlot(Slot), 0.0F))
.first;
S2RCMap.insert(std::make_pair(Slot, RC));
} else {
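The reflowed emplace is purely formatting, but the idiom deserves a note: std::piecewise_construct lets the map build the LiveInterval in place from (Register::index2StackSlot(Slot), 0.0F) instead of copying or moving a temporary. A self-contained illustration of the same idiom with a made-up Widget type:

  #include <map>
  #include <tuple>
  #include <utility>

  struct Widget {
    Widget(int Id, float Weight) : Id(Id), Weight(Weight) {}
    int Id;
    float Weight;
  };

  int main() {
    std::map<int, Widget> M;
    // Construct the key from (42) and the Widget from (42, 0.5f), in place.
    auto It = M.emplace(std::piecewise_construct, std::forward_as_tuple(42),
                        std::forward_as_tuple(42, 0.5f))
                  .first;
    return It->second.Id;
  }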
diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp
index aaff982ef1b0..9bd55c6f750f 100644
--- a/lib/CodeGen/LiveVariables.cpp
+++ b/lib/CodeGen/LiveVariables.cpp
@@ -26,6 +26,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -82,7 +83,7 @@ LLVM_DUMP_METHOD void LiveVariables::VarInfo::dump() const {
/// getVarInfo - Get (possibly creating) a VarInfo object for the given vreg.
LiveVariables::VarInfo &LiveVariables::getVarInfo(unsigned RegIdx) {
- assert(TargetRegisterInfo::isVirtualRegister(RegIdx) &&
+ assert(Register::isVirtualRegister(RegIdx) &&
"getVarInfo: not a virtual register!");
VirtRegInfo.grow(RegIdx);
return VirtRegInfo[RegIdx];
@@ -214,7 +215,7 @@ MachineInstr *LiveVariables::FindLastPartialDef(unsigned Reg,
MachineOperand &MO = LastDef->getOperand(i);
if (!MO.isReg() || !MO.isDef() || MO.getReg() == 0)
continue;
- unsigned DefReg = MO.getReg();
+ Register DefReg = MO.getReg();
if (TRI->isSubRegister(Reg, DefReg)) {
for (MCSubRegIterator SubRegs(DefReg, TRI, /*IncludeSelf=*/true);
SubRegs.isValid(); ++SubRegs)
@@ -519,10 +520,9 @@ void LiveVariables::runOnInstr(MachineInstr &MI,
}
if (!MO.isReg() || MO.getReg() == 0)
continue;
- unsigned MOReg = MO.getReg();
+ Register MOReg = MO.getReg();
if (MO.isUse()) {
- if (!(TargetRegisterInfo::isPhysicalRegister(MOReg) &&
- MRI->isReserved(MOReg)))
+ if (!(Register::isPhysicalRegister(MOReg) && MRI->isReserved(MOReg)))
MO.setIsKill(false);
if (MO.readsReg())
UseRegs.push_back(MOReg);
@@ -530,8 +530,7 @@ void LiveVariables::runOnInstr(MachineInstr &MI,
assert(MO.isDef());
// FIXME: We should not remove any dead flags. However the MIPS RDDSP
// instruction needs it at the moment: http://llvm.org/PR27116.
- if (TargetRegisterInfo::isPhysicalRegister(MOReg) &&
- !MRI->isReserved(MOReg))
+ if (Register::isPhysicalRegister(MOReg) && !MRI->isReserved(MOReg))
MO.setIsDead(false);
DefRegs.push_back(MOReg);
}
@@ -541,7 +540,7 @@ void LiveVariables::runOnInstr(MachineInstr &MI,
// Process all uses.
for (unsigned i = 0, e = UseRegs.size(); i != e; ++i) {
unsigned MOReg = UseRegs[i];
- if (TargetRegisterInfo::isVirtualRegister(MOReg))
+ if (Register::isVirtualRegister(MOReg))
HandleVirtRegUse(MOReg, MBB, MI);
else if (!MRI->isReserved(MOReg))
HandlePhysRegUse(MOReg, MI);
@@ -554,7 +553,7 @@ void LiveVariables::runOnInstr(MachineInstr &MI,
// Process all defs.
for (unsigned i = 0, e = DefRegs.size(); i != e; ++i) {
unsigned MOReg = DefRegs[i];
- if (TargetRegisterInfo::isVirtualRegister(MOReg))
+ if (Register::isVirtualRegister(MOReg))
HandleVirtRegDef(MOReg, MI);
else if (!MRI->isReserved(MOReg))
HandlePhysRegDef(MOReg, &MI, Defs);
@@ -566,7 +565,7 @@ void LiveVariables::runOnBlock(MachineBasicBlock *MBB, const unsigned NumRegs) {
// Mark live-in registers as live-in.
SmallVector<unsigned, 4> Defs;
for (const auto &LI : MBB->liveins()) {
- assert(TargetRegisterInfo::isPhysicalRegister(LI.PhysReg) &&
+ assert(Register::isPhysicalRegister(LI.PhysReg) &&
"Cannot have a live-in virtual register!");
HandlePhysRegDef(LI.PhysReg, nullptr, Defs);
}
@@ -654,7 +653,7 @@ bool LiveVariables::runOnMachineFunction(MachineFunction &mf) {
// Convert and transfer the dead / killed information we have gathered into
// VirtRegInfo onto MI's.
for (unsigned i = 0, e1 = VirtRegInfo.size(); i != e1; ++i) {
- const unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ const unsigned Reg = Register::index2VirtReg(i);
for (unsigned j = 0, e2 = VirtRegInfo[Reg].Kills.size(); j != e2; ++j)
if (VirtRegInfo[Reg].Kills[j] == MRI->getVRegDef(Reg))
VirtRegInfo[Reg].Kills[j]->addRegisterDead(Reg, TRI);
@@ -692,8 +691,8 @@ void LiveVariables::removeVirtualRegistersKilled(MachineInstr &MI) {
MachineOperand &MO = MI.getOperand(i);
if (MO.isReg() && MO.isKill()) {
MO.setIsKill(false);
- unsigned Reg = MO.getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ Register Reg = MO.getReg();
+ if (Register::isVirtualRegister(Reg)) {
bool removed = getVarInfo(Reg).removeKill(MI);
assert(removed && "kill not in register's VarInfo?");
(void)removed;
@@ -783,7 +782,7 @@ void LiveVariables::addNewBlock(MachineBasicBlock *BB,
for (; BBI != BBE; ++BBI) {
for (MachineInstr::mop_iterator I = BBI->operands_begin(),
E = BBI->operands_end(); I != E; ++I) {
- if (I->isReg() && TargetRegisterInfo::isVirtualRegister(I->getReg())) {
+ if (I->isReg() && Register::isVirtualRegister(I->getReg())) {
if (I->isDef())
Defs.insert(I->getReg());
else if (I->isKill())
@@ -794,7 +793,7 @@ void LiveVariables::addNewBlock(MachineBasicBlock *BB,
// Update info for all live variables
for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ unsigned Reg = Register::index2VirtReg(i);
// If the register is defined in the successor it can't be live in BB.
if (Defs.count(Reg))
diff --git a/lib/CodeGen/LocalStackSlotAllocation.cpp b/lib/CodeGen/LocalStackSlotAllocation.cpp
index b14d76a585f7..2392d4d00b56 100644
--- a/lib/CodeGen/LocalStackSlotAllocation.cpp
+++ b/lib/CodeGen/LocalStackSlotAllocation.cpp
@@ -261,7 +261,7 @@ void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
// Remember how big this blob of stack space is
MFI.setLocalFrameSize(Offset);
- MFI.setLocalFrameMaxAlign(MaxAlign);
+ MFI.setLocalFrameMaxAlign(assumeAligned(MaxAlign));
}
static inline bool
@@ -351,6 +351,14 @@ bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) {
assert(MFI.isObjectPreAllocated(FrameIdx) &&
"Only pre-allocated locals expected!");
+ // We need to keep the references to the stack protector slot through frame
+ // index operands so that it gets resolved by PEI rather than this pass.
+ // This avoids accesses to the stack protector though virtual base
+ // registers, and forces PEI to address it using fp/sp/bp.
+ if (MFI.hasStackProtectorIndex() &&
+ FrameIdx == MFI.getStackProtectorIndex())
+ continue;
+
LLVM_DEBUG(dbgs() << "Considering: " << MI);
unsigned idx = 0;
diff --git a/lib/CodeGen/LowerEmuTLS.cpp b/lib/CodeGen/LowerEmuTLS.cpp
index c8cf6abda4fc..ed48365b0102 100644
--- a/lib/CodeGen/LowerEmuTLS.cpp
+++ b/lib/CodeGen/LowerEmuTLS.cpp
@@ -142,7 +142,7 @@ bool LowerEmuTLS::addEmuTlsVar(Module &M, const GlobalVariable *GV) {
assert(EmuTlsTmplVar && "Failed to create emulated TLS initializer");
EmuTlsTmplVar->setConstant(true);
EmuTlsTmplVar->setInitializer(const_cast<Constant*>(InitValue));
- EmuTlsTmplVar->setAlignment(GVAlignment);
+ EmuTlsTmplVar->setAlignment(Align(GVAlignment));
copyLinkageVisibility(M, GV, EmuTlsTmplVar);
}
@@ -155,9 +155,8 @@ bool LowerEmuTLS::addEmuTlsVar(Module &M, const GlobalVariable *GV) {
ArrayRef<Constant*> ElementValueArray(ElementValues, 4);
EmuTlsVar->setInitializer(
ConstantStruct::get(EmuTlsVarType, ElementValueArray));
- unsigned MaxAlignment = std::max(
- DL.getABITypeAlignment(WordType),
- DL.getABITypeAlignment(VoidPtrType));
+ Align MaxAlignment(std::max(DL.getABITypeAlignment(WordType),
+ DL.getABITypeAlignment(VoidPtrType)));
EmuTlsVar->setAlignment(MaxAlignment);
return true;
}
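Both hunks here belong to the wider migration from raw unsigned alignments to the llvm::Align type, which wraps a known power-of-two value and asserts on zero; assumeAligned, used in the LocalStackSlotAllocation hunk above, instead maps zero to Align(1). A small sketch of the API as it is used in these changes:

  #include "llvm/Support/Alignment.h"
  #include <algorithm>
  #include <cstdint>

  uint64_t maxAlignValue() {
    llvm::Align A(16);                // must be a power of two; Align(0) asserts
    llvm::Align B(8);
    llvm::Align Max = std::max(A, B); // Align is totally ordered
    return Max.value();               // back to a plain integer when needed
  }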
diff --git a/lib/CodeGen/MIRCanonicalizerPass.cpp b/lib/CodeGen/MIRCanonicalizerPass.cpp
index f49bc854e23f..c9bb5461aa3c 100644
--- a/lib/CodeGen/MIRCanonicalizerPass.cpp
+++ b/lib/CodeGen/MIRCanonicalizerPass.cpp
@@ -23,12 +23,14 @@
//
//===----------------------------------------------------------------------===//
+#include "MIRVRegNamerUtils.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <queue>
@@ -71,28 +73,6 @@ public:
} // end anonymous namespace
-enum VRType { RSE_Reg = 0, RSE_FrameIndex, RSE_NewCandidate };
-class TypedVReg {
- VRType type;
- unsigned reg;
-
-public:
- TypedVReg(unsigned reg) : type(RSE_Reg), reg(reg) {}
- TypedVReg(VRType type) : type(type), reg(~0U) {
- assert(type != RSE_Reg && "Expected a non-register type.");
- }
-
- bool isReg() const { return type == RSE_Reg; }
- bool isFrameIndex() const { return type == RSE_FrameIndex; }
- bool isCandidate() const { return type == RSE_NewCandidate; }
-
- VRType getType() const { return type; }
- unsigned getReg() const {
- assert(this->isReg() && "Expected a virtual or physical register.");
- return reg;
- }
-};
-
char MIRCanonicalizer::ID;
char &llvm::MIRCanonicalizerID = MIRCanonicalizer::ID;
@@ -190,7 +170,7 @@ static bool rescheduleCanonically(unsigned &PseudoIdempotentInstCount,
if (!MO.isReg())
continue;
- if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ if (Register::isVirtualRegister(MO.getReg()))
continue;
if (!MO.isDef())
@@ -207,7 +187,7 @@ static bool rescheduleCanonically(unsigned &PseudoIdempotentInstCount,
continue;
MachineOperand &MO = II->getOperand(0);
- if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg()))
continue;
if (!MO.isDef())
continue;
@@ -220,7 +200,7 @@ static bool rescheduleCanonically(unsigned &PseudoIdempotentInstCount,
}
if (II->getOperand(i).isReg()) {
- if (!TargetRegisterInfo::isVirtualRegister(II->getOperand(i).getReg()))
+ if (!Register::isVirtualRegister(II->getOperand(i).getReg()))
if (llvm::find(PhysRegDefs, II->getOperand(i).getReg()) ==
PhysRegDefs.end()) {
continue;
@@ -340,12 +320,12 @@ static bool propagateLocalCopies(MachineBasicBlock *MBB) {
if (!MI->getOperand(1).isReg())
continue;
- const unsigned Dst = MI->getOperand(0).getReg();
- const unsigned Src = MI->getOperand(1).getReg();
+ const Register Dst = MI->getOperand(0).getReg();
+ const Register Src = MI->getOperand(1).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Dst))
+ if (!Register::isVirtualRegister(Dst))
continue;
- if (!TargetRegisterInfo::isVirtualRegister(Src))
+ if (!Register::isVirtualRegister(Src))
continue;
// Not folding COPY instructions if regbankselect has not set the RCs.
// Why are we only considering Register Classes? Because the verifier
@@ -370,258 +350,6 @@ static bool propagateLocalCopies(MachineBasicBlock *MBB) {
return Changed;
}
-/// Here we find our candidates. What makes an interesting candidate?
-/// An candidate for a canonicalization tree root is normally any kind of
-/// instruction that causes side effects such as a store to memory or a copy to
-/// a physical register or a return instruction. We use these as an expression
-/// tree root that we walk inorder to build a canonical walk which should result
-/// in canoncal vreg renaming.
-static std::vector<MachineInstr *> populateCandidates(MachineBasicBlock *MBB) {
- std::vector<MachineInstr *> Candidates;
- MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
-
- for (auto II = MBB->begin(), IE = MBB->end(); II != IE; ++II) {
- MachineInstr *MI = &*II;
-
- bool DoesMISideEffect = false;
-
- if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg()) {
- const unsigned Dst = MI->getOperand(0).getReg();
- DoesMISideEffect |= !TargetRegisterInfo::isVirtualRegister(Dst);
-
- for (auto UI = MRI.use_begin(Dst); UI != MRI.use_end(); ++UI) {
- if (DoesMISideEffect)
- break;
- DoesMISideEffect |= (UI->getParent()->getParent() != MI->getParent());
- }
- }
-
- if (!MI->mayStore() && !MI->isBranch() && !DoesMISideEffect)
- continue;
-
- LLVM_DEBUG(dbgs() << "Found Candidate: "; MI->dump(););
- Candidates.push_back(MI);
- }
-
- return Candidates;
-}
-
-static void doCandidateWalk(std::vector<TypedVReg> &VRegs,
- std::queue<TypedVReg> &RegQueue,
- std::vector<MachineInstr *> &VisitedMIs,
- const MachineBasicBlock *MBB) {
-
- const MachineFunction &MF = *MBB->getParent();
- const MachineRegisterInfo &MRI = MF.getRegInfo();
-
- while (!RegQueue.empty()) {
-
- auto TReg = RegQueue.front();
- RegQueue.pop();
-
- if (TReg.isFrameIndex()) {
- LLVM_DEBUG(dbgs() << "Popping frame index.\n";);
- VRegs.push_back(TypedVReg(RSE_FrameIndex));
- continue;
- }
-
- assert(TReg.isReg() && "Expected vreg or physreg.");
- unsigned Reg = TReg.getReg();
-
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
- LLVM_DEBUG({
- dbgs() << "Popping vreg ";
- MRI.def_begin(Reg)->dump();
- dbgs() << "\n";
- });
-
- if (!llvm::any_of(VRegs, [&](const TypedVReg &TR) {
- return TR.isReg() && TR.getReg() == Reg;
- })) {
- VRegs.push_back(TypedVReg(Reg));
- }
- } else {
- LLVM_DEBUG(dbgs() << "Popping physreg.\n";);
- VRegs.push_back(TypedVReg(Reg));
- continue;
- }
-
- for (auto RI = MRI.def_begin(Reg), RE = MRI.def_end(); RI != RE; ++RI) {
- MachineInstr *Def = RI->getParent();
-
- if (Def->getParent() != MBB)
- continue;
-
- if (llvm::any_of(VisitedMIs,
- [&](const MachineInstr *VMI) { return Def == VMI; })) {
- break;
- }
-
- LLVM_DEBUG({
- dbgs() << "\n========================\n";
- dbgs() << "Visited MI: ";
- Def->dump();
- dbgs() << "BB Name: " << Def->getParent()->getName() << "\n";
- dbgs() << "\n========================\n";
- });
- VisitedMIs.push_back(Def);
- for (unsigned I = 1, E = Def->getNumOperands(); I != E; ++I) {
-
- MachineOperand &MO = Def->getOperand(I);
- if (MO.isFI()) {
- LLVM_DEBUG(dbgs() << "Pushing frame index.\n";);
- RegQueue.push(TypedVReg(RSE_FrameIndex));
- }
-
- if (!MO.isReg())
- continue;
- RegQueue.push(TypedVReg(MO.getReg()));
- }
- }
- }
-}
-
-namespace {
-class NamedVRegCursor {
- MachineRegisterInfo &MRI;
- unsigned virtualVRegNumber;
-
-public:
- NamedVRegCursor(MachineRegisterInfo &MRI) : MRI(MRI), virtualVRegNumber(0) {}
-
- void SkipVRegs() {
- unsigned VRegGapIndex = 1;
- if (!virtualVRegNumber) {
- VRegGapIndex = 0;
- virtualVRegNumber = MRI.createIncompleteVirtualRegister();
- }
- const unsigned VR_GAP = (++VRegGapIndex * 1000);
-
- unsigned I = virtualVRegNumber;
- const unsigned E = (((I + VR_GAP) / VR_GAP) + 1) * VR_GAP;
-
- virtualVRegNumber = E;
- }
-
- unsigned getVirtualVReg() const { return virtualVRegNumber; }
-
- unsigned incrementVirtualVReg(unsigned incr = 1) {
- virtualVRegNumber += incr;
- return virtualVRegNumber;
- }
-
- unsigned createVirtualRegister(unsigned VReg) {
- if (!virtualVRegNumber)
- SkipVRegs();
- std::string S;
- raw_string_ostream OS(S);
- OS << "namedVReg" << (virtualVRegNumber & ~0x80000000);
- OS.flush();
- virtualVRegNumber++;
- if (auto RC = MRI.getRegClassOrNull(VReg))
- return MRI.createVirtualRegister(RC, OS.str());
- return MRI.createGenericVirtualRegister(MRI.getType(VReg), OS.str());
- }
-};
-} // namespace
-
-static std::map<unsigned, unsigned>
-GetVRegRenameMap(const std::vector<TypedVReg> &VRegs,
- const std::vector<unsigned> &renamedInOtherBB,
- MachineRegisterInfo &MRI, NamedVRegCursor &NVC) {
- std::map<unsigned, unsigned> VRegRenameMap;
- bool FirstCandidate = true;
-
- for (auto &vreg : VRegs) {
- if (vreg.isFrameIndex()) {
- // We skip one vreg for any frame index because there is a good chance
- // (especially when comparing SelectionDAG to GlobalISel generated MIR)
- // that in the other file we are just getting an incoming vreg that comes
- // from a copy from a frame index. So it's safe to skip by one.
- unsigned LastRenameReg = NVC.incrementVirtualVReg();
- (void)LastRenameReg;
- LLVM_DEBUG(dbgs() << "Skipping rename for FI " << LastRenameReg << "\n";);
- continue;
- } else if (vreg.isCandidate()) {
-
- // After the first candidate, for every subsequent candidate, we skip mod
- // 10 registers so that the candidates are more likely to start at the
- // same vreg number making it more likely that the canonical walk from the
- // candidate insruction. We don't need to skip from the first candidate of
- // the BasicBlock because we already skip ahead several vregs for each BB.
- unsigned LastRenameReg = NVC.getVirtualVReg();
- if (FirstCandidate)
- NVC.incrementVirtualVReg(LastRenameReg % 10);
- FirstCandidate = false;
- continue;
- } else if (!TargetRegisterInfo::isVirtualRegister(vreg.getReg())) {
- unsigned LastRenameReg = NVC.incrementVirtualVReg();
- (void)LastRenameReg;
- LLVM_DEBUG({
- dbgs() << "Skipping rename for Phys Reg " << LastRenameReg << "\n";
- });
- continue;
- }
-
- auto Reg = vreg.getReg();
- if (llvm::find(renamedInOtherBB, Reg) != renamedInOtherBB.end()) {
- LLVM_DEBUG(dbgs() << "Vreg " << Reg
- << " already renamed in other BB.\n";);
- continue;
- }
-
- auto Rename = NVC.createVirtualRegister(Reg);
-
- if (VRegRenameMap.find(Reg) == VRegRenameMap.end()) {
- LLVM_DEBUG(dbgs() << "Mapping vreg ";);
- if (MRI.reg_begin(Reg) != MRI.reg_end()) {
- LLVM_DEBUG(auto foo = &*MRI.reg_begin(Reg); foo->dump(););
- } else {
- LLVM_DEBUG(dbgs() << Reg;);
- }
- LLVM_DEBUG(dbgs() << " to ";);
- if (MRI.reg_begin(Rename) != MRI.reg_end()) {
- LLVM_DEBUG(auto foo = &*MRI.reg_begin(Rename); foo->dump(););
- } else {
- LLVM_DEBUG(dbgs() << Rename;);
- }
- LLVM_DEBUG(dbgs() << "\n";);
-
- VRegRenameMap.insert(std::pair<unsigned, unsigned>(Reg, Rename));
- }
- }
-
- return VRegRenameMap;
-}
-
-static bool doVRegRenaming(std::vector<unsigned> &RenamedInOtherBB,
- const std::map<unsigned, unsigned> &VRegRenameMap,
- MachineRegisterInfo &MRI) {
- bool Changed = false;
- for (auto I = VRegRenameMap.begin(), E = VRegRenameMap.end(); I != E; ++I) {
-
- auto VReg = I->first;
- auto Rename = I->second;
-
- RenamedInOtherBB.push_back(Rename);
-
- std::vector<MachineOperand *> RenameMOs;
- for (auto &MO : MRI.reg_operands(VReg)) {
- RenameMOs.push_back(&MO);
- }
-
- for (auto *MO : RenameMOs) {
- Changed = true;
- MO->setReg(Rename);
-
- if (!MO->isDef())
- MO->setIsKill(false);
- }
- }
-
- return Changed;
-}
-
static bool doDefKillClear(MachineBasicBlock *MBB) {
bool Changed = false;
@@ -646,9 +374,7 @@ static bool doDefKillClear(MachineBasicBlock *MBB) {
static bool runOnBasicBlock(MachineBasicBlock *MBB,
std::vector<StringRef> &bbNames,
- std::vector<unsigned> &renamedInOtherBB,
- unsigned &basicBlockNum, unsigned &VRegGapIndex,
- NamedVRegCursor &NVC) {
+ unsigned &basicBlockNum, NamedVRegCursor &NVC) {
if (CanonicalizeBasicBlockNumber != ~0U) {
if (CanonicalizeBasicBlockNumber != basicBlockNum++)
@@ -687,74 +413,20 @@ static bool runOnBasicBlock(MachineBasicBlock *MBB,
Changed |= rescheduleCanonically(IdempotentInstCount, MBB);
LLVM_DEBUG(dbgs() << "MBB After Scheduling:\n"; MBB->dump(););
- std::vector<MachineInstr *> Candidates = populateCandidates(MBB);
- std::vector<MachineInstr *> VisitedMIs;
- llvm::copy(Candidates, std::back_inserter(VisitedMIs));
-
- std::vector<TypedVReg> VRegs;
- for (auto candidate : Candidates) {
- VRegs.push_back(TypedVReg(RSE_NewCandidate));
-
- std::queue<TypedVReg> RegQueue;
-
- // Here we walk the vreg operands of a non-root node along our walk.
- // The root nodes are the original candidates (stores normally).
- // These are normally not the root nodes (except for the case of copies to
- // physical registers).
- for (unsigned i = 1; i < candidate->getNumOperands(); i++) {
- if (candidate->mayStore() || candidate->isBranch())
- break;
-
- MachineOperand &MO = candidate->getOperand(i);
- if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
- continue;
-
- LLVM_DEBUG(dbgs() << "Enqueue register"; MO.dump(); dbgs() << "\n";);
- RegQueue.push(TypedVReg(MO.getReg()));
- }
-
- // Here we walk the root candidates. We start from the 0th operand because
- // the root is normally a store to a vreg.
- for (unsigned i = 0; i < candidate->getNumOperands(); i++) {
-
- if (!candidate->mayStore() && !candidate->isBranch())
- break;
-
- MachineOperand &MO = candidate->getOperand(i);
-
- // TODO: Do we want to only add vregs here?
- if (!MO.isReg() && !MO.isFI())
- continue;
-
- LLVM_DEBUG(dbgs() << "Enqueue Reg/FI"; MO.dump(); dbgs() << "\n";);
-
- RegQueue.push(MO.isReg() ? TypedVReg(MO.getReg())
- : TypedVReg(RSE_FrameIndex));
- }
-
- doCandidateWalk(VRegs, RegQueue, VisitedMIs, MBB);
- }
-
- // If we have populated no vregs to rename then bail.
- // The rest of this function does the vreg remapping.
- if (VRegs.size() == 0)
- return Changed;
-
- auto VRegRenameMap = GetVRegRenameMap(VRegs, renamedInOtherBB, MRI, NVC);
- Changed |= doVRegRenaming(renamedInOtherBB, VRegRenameMap, MRI);
+ Changed |= NVC.renameVRegs(MBB);
// Here we renumber the def vregs for the idempotent instructions from the top
// of the MachineBasicBlock so that they are named in the order that we sorted
// them alphabetically. Eventually we won't need SkipVRegs because we will use
// named vregs instead.
if (IdempotentInstCount)
- NVC.SkipVRegs();
+ NVC.skipVRegs();
auto MII = MBB->begin();
for (unsigned i = 0; i < IdempotentInstCount && MII != MBB->end(); ++i) {
MachineInstr &MI = *MII++;
Changed = true;
- unsigned vRegToRename = MI.getOperand(0).getReg();
+ Register vRegToRename = MI.getOperand(0).getReg();
auto Rename = NVC.createVirtualRegister(vRegToRename);
std::vector<MachineOperand *> RenameMOs;
@@ -799,9 +471,7 @@ bool MIRCanonicalizer::runOnMachineFunction(MachineFunction &MF) {
<< "\n\n================================================\n\n";);
std::vector<StringRef> BBNames;
- std::vector<unsigned> RenamedInOtherBB;
- unsigned GapIdx = 0;
unsigned BBNum = 0;
bool Changed = false;
@@ -809,8 +479,7 @@ bool MIRCanonicalizer::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
NamedVRegCursor NVC(MRI);
for (auto MBB : RPOList)
- Changed |=
- runOnBasicBlock(MBB, BBNames, RenamedInOtherBB, BBNum, GapIdx, NVC);
+ Changed |= runOnBasicBlock(MBB, BBNames, BBNum, NVC);
return Changed;
}
diff --git a/lib/CodeGen/MIRNamerPass.cpp b/lib/CodeGen/MIRNamerPass.cpp
new file mode 100644
index 000000000000..9d719f3917ce
--- /dev/null
+++ b/lib/CodeGen/MIRNamerPass.cpp
@@ -0,0 +1,77 @@
+//===----------------------- MIRNamer.cpp - MIR Namer ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The purpose of this pass is to rename virtual register operands with the
+// goal of making MIR tests easier to author and to read. This pass reuses
+// the vreg renamer used by MIRCanonicalizerPass.
+//
+// Basic Usage:
+//
+// llc -o - -run-pass mir-namer example.mir
+//
+//===----------------------------------------------------------------------===//
+
+#include "MIRVRegNamerUtils.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+
+using namespace llvm;
+
+namespace llvm {
+extern char &MIRNamerID;
+} // namespace llvm
+
+#define DEBUG_TYPE "mir-namer"
+
+namespace {
+
+class MIRNamer : public MachineFunctionPass {
+public:
+ static char ID;
+ MIRNamer() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override {
+ return "Rename virtual register operands";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ bool Changed = false;
+
+ if (MF.empty())
+ return Changed;
+
+ NamedVRegCursor NVC(MF.getRegInfo());
+
+ ReversePostOrderTraversal<MachineBasicBlock *> RPOT(&*MF.begin());
+ for (auto &MBB : RPOT)
+ Changed |= NVC.renameVRegs(MBB);
+
+ return Changed;
+ }
+};
+
+} // end anonymous namespace
+
+char MIRNamer::ID;
+
+char &llvm::MIRNamerID = MIRNamer::ID;
+
+INITIALIZE_PASS_BEGIN(MIRNamer, "mir-namer", "Rename Register Operands", false,
+ false)
+
+INITIALIZE_PASS_END(MIRNamer, "mir-namer", "Rename Register Operands", false,
+ false)
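To make the effect concrete: where an unnamed def would previously print with a bare number, the renamer gives it a name derived from the cursor's counter. On a hypothetical AArch64 line, a def such as

  %4:gpr32 = MOVi32imm 1

might come out as

  %namedVReg1004:gpr32 = MOVi32imm 1

after the pass; the exact number depends on the cursor state, so both the opcode and the number are illustrative only.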
diff --git a/lib/CodeGen/MIRParser/MILexer.cpp b/lib/CodeGen/MIRParser/MILexer.cpp
index 4899bd3f5811..ad5c617623f2 100644
--- a/lib/CodeGen/MIRParser/MILexer.cpp
+++ b/lib/CodeGen/MIRParser/MILexer.cpp
@@ -249,6 +249,7 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) {
.Case("successors", MIToken::kw_successors)
.Case("floatpred", MIToken::kw_floatpred)
.Case("intpred", MIToken::kw_intpred)
+ .Case("shufflemask", MIToken::kw_shufflemask)
.Case("pre-instr-symbol", MIToken::kw_pre_instr_symbol)
.Case("post-instr-symbol", MIToken::kw_post_instr_symbol)
.Case("unknown-size", MIToken::kw_unknown_size)
diff --git a/lib/CodeGen/MIRParser/MILexer.h b/lib/CodeGen/MIRParser/MILexer.h
index 0fe3f9f706db..200f9d026cc8 100644
--- a/lib/CodeGen/MIRParser/MILexer.h
+++ b/lib/CodeGen/MIRParser/MILexer.h
@@ -117,6 +117,7 @@ struct MIToken {
kw_successors,
kw_floatpred,
kw_intpred,
+ kw_shufflemask,
kw_pre_instr_symbol,
kw_post_instr_symbol,
kw_unknown_size,
@@ -146,6 +147,7 @@ struct MIToken {
IntegerLiteral,
FloatingPointLiteral,
HexLiteral,
+ VectorLiteral,
VirtualRegister,
ConstantPoolItem,
JumpTableIndex,
diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp
index c0b800a0b870..6498acc9fa51 100644
--- a/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/lib/CodeGen/MIRParser/MIParser.cpp
@@ -451,6 +451,7 @@ public:
bool parseBlockAddressOperand(MachineOperand &Dest);
bool parseIntrinsicOperand(MachineOperand &Dest);
bool parsePredicateOperand(MachineOperand &Dest);
+ bool parseShuffleMaskOperand(MachineOperand &Dest);
bool parseTargetIndexOperand(MachineOperand &Dest);
bool parseCustomRegisterMaskOperand(MachineOperand &Dest);
bool parseLiveoutRegisterMaskOperand(MachineOperand &Dest);
@@ -640,7 +641,7 @@ bool MIParser::parseBasicBlockDefinition(
return error(Loc, Twine("redefinition of machine basic block with id #") +
Twine(ID));
if (Alignment)
- MBB->setAlignment(Alignment);
+ MBB->setAlignment(Align(Alignment));
if (HasAddressTaken)
MBB->setHasAddressTaken();
MBB->setIsEHPad(IsLandingPad);
@@ -1078,7 +1079,7 @@ static const char *printImplicitRegisterFlag(const MachineOperand &MO) {
static std::string getRegisterName(const TargetRegisterInfo *TRI,
unsigned Reg) {
- assert(TargetRegisterInfo::isPhysicalRegister(Reg) && "expected phys reg");
+ assert(Register::isPhysicalRegister(Reg) && "expected phys reg");
return StringRef(TRI->getName(Reg)).lower();
}
@@ -1408,11 +1409,11 @@ bool MIParser::parseRegisterOperand(MachineOperand &Dest,
if (Token.is(MIToken::dot)) {
if (parseSubRegisterIndex(SubReg))
return true;
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
return error("subregister index expects a virtual register");
}
if (Token.is(MIToken::colon)) {
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
return error("register class specification expects a virtual register");
lex();
if (parseRegisterClassOrBank(*RegInfo))
@@ -1436,12 +1437,13 @@ bool MIParser::parseRegisterOperand(MachineOperand &Dest,
if (MRI.getType(Reg).isValid() && MRI.getType(Reg) != Ty)
return error("inconsistent type for generic virtual register");
+ MRI.setRegClassOrRegBank(Reg, static_cast<RegisterBank *>(nullptr));
MRI.setType(Reg, Ty);
}
}
} else if (consumeIfPresent(MIToken::lparen)) {
// Virtual registers may have a type with GlobalISel.
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
return error("unexpected type on physical register");
LLT Ty;
@@ -1454,8 +1456,9 @@ bool MIParser::parseRegisterOperand(MachineOperand &Dest,
if (MRI.getType(Reg).isValid() && MRI.getType(Reg) != Ty)
return error("inconsistent type for generic virtual register");
+ MRI.setRegClassOrRegBank(Reg, static_cast<RegisterBank *>(nullptr));
MRI.setType(Reg, Ty);
- } else if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ } else if (Register::isVirtualRegister(Reg)) {
// Generic virtual registers must have a type.
// If we end up here this means the type hasn't been specified and
// this is bad!
@@ -2285,6 +2288,49 @@ bool MIParser::parsePredicateOperand(MachineOperand &Dest) {
return false;
}
+bool MIParser::parseShuffleMaskOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::kw_shufflemask));
+
+ lex();
+ if (expectAndConsume(MIToken::lparen))
+ return error("expected syntax shufflemask(<integer or undef>, ...)");
+
+ SmallVector<Constant *, 32> ShufMask;
+ LLVMContext &Ctx = MF.getFunction().getContext();
+ Type *I32Ty = Type::getInt32Ty(Ctx);
+
+ bool AllZero = true;
+ bool AllUndef = true;
+
+ do {
+ if (Token.is(MIToken::kw_undef)) {
+ ShufMask.push_back(UndefValue::get(I32Ty));
+ AllZero = false;
+ } else if (Token.is(MIToken::IntegerLiteral)) {
+ AllUndef = false;
+ const APSInt &Int = Token.integerValue();
+ if (!Int.isNullValue())
+ AllZero = false;
+ ShufMask.push_back(ConstantInt::get(I32Ty, Int.getExtValue()));
+ } else
+ return error("expected integer constant");
+
+ lex();
+ } while (consumeIfPresent(MIToken::comma));
+
+ if (expectAndConsume(MIToken::rparen))
+ return error("shufflemask should be terminated by ')'.");
+
+ if (AllZero || AllUndef) {
+ VectorType *VT = VectorType::get(I32Ty, ShufMask.size());
+ Constant *C = AllZero ? Constant::getNullValue(VT) : UndefValue::get(VT);
+ Dest = MachineOperand::CreateShuffleMask(C);
+ } else
+ Dest = MachineOperand::CreateShuffleMask(ConstantVector::get(ShufMask));
+
+ return false;
+}
+
bool MIParser::parseTargetIndexOperand(MachineOperand &Dest) {
assert(Token.is(MIToken::kw_target_index));
lex();
@@ -2432,6 +2478,8 @@ bool MIParser::parseMachineOperand(MachineOperand &Dest,
case MIToken::kw_floatpred:
case MIToken::kw_intpred:
return parsePredicateOperand(Dest);
+ case MIToken::kw_shufflemask:
+ return parseShuffleMaskOperand(Dest);
case MIToken::Error:
return true;
case MIToken::Identifier:
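Putting the lexer keyword, the token kinds, and parseShuffleMaskOperand together, the concrete syntax the parser now accepts looks roughly like this MIR fragment (the opcode and register types are illustrative, not taken from a test in this patch):

  %2:_(<4 x s32>) = G_SHUFFLE_VECTOR %0(<4 x s32>), %1, shufflemask(0, 1, undef, 3)

An all-zero or all-undef mask is collapsed to Constant::getNullValue or UndefValue of the vector type, per the AllZero/AllUndef flags above.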
diff --git a/lib/CodeGen/MIRParser/MIRParser.cpp b/lib/CodeGen/MIRParser/MIRParser.cpp
index b242934def80..55fac93d8991 100644
--- a/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -216,7 +216,7 @@ std::unique_ptr<Module> MIRParserImpl::parseIRModule() {
return nullptr;
// Create an empty module when the MIR file is empty.
NoMIRDocuments = true;
- return llvm::make_unique<Module>(Filename, Context);
+ return std::make_unique<Module>(Filename, Context);
}
std::unique_ptr<Module> M;
@@ -236,7 +236,7 @@ std::unique_ptr<Module> MIRParserImpl::parseIRModule() {
NoMIRDocuments = true;
} else {
// Create a new, empty module.
- M = llvm::make_unique<Module>(Filename, Context);
+ M = std::make_unique<Module>(Filename, Context);
NoLLVMIR = true;
}
return M;
@@ -306,7 +306,7 @@ bool MIRParserImpl::parseMachineFunction(Module &M, MachineModuleInfo &MMI) {
static bool isSSA(const MachineFunction &MF) {
const MachineRegisterInfo &MRI = MF.getRegInfo();
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+ unsigned Reg = Register::index2VirtReg(I);
if (!MRI.hasOneDef(Reg) && !MRI.def_empty(Reg))
return false;
}
@@ -355,10 +355,10 @@ bool MIRParserImpl::initializeCallSiteInfo(
if (MILoc.Offset >= CallB->size())
return error(Twine(MF.getName()) +
Twine(" call instruction offset out of range.") +
- "Unable to reference instruction at bb: " +
+ " Unable to reference instruction at bb: " +
Twine(MILoc.BlockNum) + " at offset:" + Twine(MILoc.Offset));
- auto CallI = std::next(CallB->begin(), MILoc.Offset);
- if (!CallI->isCall())
+ auto CallI = std::next(CallB->instr_begin(), MILoc.Offset);
+ if (!CallI->isCall(MachineInstr::IgnoreBundle))
return error(Twine(MF.getName()) +
Twine(" call site info should reference call "
"instruction. Instruction at bb:") +
@@ -393,7 +393,7 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF,
}
if (YamlMF.Alignment)
- MF.setAlignment(YamlMF.Alignment);
+ MF.setAlignment(Align(YamlMF.Alignment));
MF.setExposesReturnsTwice(YamlMF.ExposesReturnsTwice);
MF.setHasWinCFI(YamlMF.HasWinCFI);
@@ -949,6 +949,6 @@ llvm::createMIRParser(std::unique_ptr<MemoryBuffer> Contents,
"Can't read MIR with a Context that discards named Values")));
return nullptr;
}
- return llvm::make_unique<MIRParser>(
- llvm::make_unique<MIRParserImpl>(std::move(Contents), Filename, Context));
+ return std::make_unique<MIRParser>(
+ std::make_unique<MIRParserImpl>(std::move(Contents), Filename, Context));
}
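The instr_begin()/isCall(MachineInstr::IgnoreBundle) change matters once instructions are bundled: the plain block iterator advances one whole bundle at a time, so an offset counted over individual instructions would land on the wrong one. A sketch of the two iteration granularities, assuming a block MBB:

  // Top-level view: one step per bundle (a bundle header counts once).
  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ++I) {
  }

  // Instruction view: every instruction, including those inside bundles.
  for (MachineBasicBlock::instr_iterator I = MBB.instr_begin(),
                                         E = MBB.instr_end();
       I != E; ++I) {
  }

The matching printer change in MIRPrinter.cpp below computes the stored offset with instr_begin() for the same reason.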
diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp
index 0a95a0ced0f5..1a4e21ac06a9 100644
--- a/lib/CodeGen/MIRPrinter.cpp
+++ b/lib/CodeGen/MIRPrinter.cpp
@@ -197,7 +197,7 @@ void MIRPrinter::print(const MachineFunction &MF) {
yaml::MachineFunction YamlMF;
YamlMF.Name = MF.getName();
- YamlMF.Alignment = MF.getAlignment();
+ YamlMF.Alignment = MF.getAlignment().value();
YamlMF.ExposesReturnsTwice = MF.exposesReturnsTwice();
YamlMF.HasWinCFI = MF.hasWinCFI();
@@ -290,7 +290,7 @@ void MIRPrinter::convert(yaml::MachineFunction &MF,
// Print the virtual register definitions.
for (unsigned I = 0, E = RegInfo.getNumVirtRegs(); I < E; ++I) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+ unsigned Reg = Register::index2VirtReg(I);
yaml::VirtualRegisterDefinition VReg;
VReg.ID = I;
if (RegInfo.getVRegName(Reg) != "")
@@ -473,10 +473,11 @@ void MIRPrinter::convertCallSiteObjects(yaml::MachineFunction &YMF,
yaml::CallSiteInfo::MachineInstrLoc CallLocation;
// Prepare instruction position.
- MachineBasicBlock::const_iterator CallI = CSInfo.first->getIterator();
+ MachineBasicBlock::const_instr_iterator CallI = CSInfo.first->getIterator();
CallLocation.BlockNum = CallI->getParent()->getNumber();
// Get call instruction offset from the beginning of block.
- CallLocation.Offset = std::distance(CallI->getParent()->begin(), CallI);
+ CallLocation.Offset =
+ std::distance(CallI->getParent()->instr_begin(), CallI);
YmlCS.CallLocation = CallLocation;
// Construct call arguments and their forwarding register info.
for (auto ArgReg : CSInfo.second) {
@@ -628,9 +629,9 @@ void MIPrinter::print(const MachineBasicBlock &MBB) {
OS << "landing-pad";
HasAttributes = true;
}
- if (MBB.getAlignment()) {
+ if (MBB.getAlignment() != Align::None()) {
OS << (HasAttributes ? ", " : " (");
- OS << "align " << MBB.getAlignment();
+ OS << "align " << MBB.getAlignment().value();
HasAttributes = true;
}
if (HasAttributes)
@@ -842,7 +843,8 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx,
case MachineOperand::MO_CFIIndex:
case MachineOperand::MO_IntrinsicID:
case MachineOperand::MO_Predicate:
- case MachineOperand::MO_BlockAddress: {
+ case MachineOperand::MO_BlockAddress:
+ case MachineOperand::MO_ShuffleMask: {
unsigned TiedOperandIdx = 0;
if (ShouldPrintRegisterTies && Op.isReg() && Op.isTied() && !Op.isDef())
TiedOperandIdx = Op.getParent()->findTiedOperandIdx(OpIdx);
diff --git a/lib/CodeGen/MIRVRegNamerUtils.cpp b/lib/CodeGen/MIRVRegNamerUtils.cpp
new file mode 100644
index 000000000000..6629000f468f
--- /dev/null
+++ b/lib/CodeGen/MIRVRegNamerUtils.cpp
@@ -0,0 +1,348 @@
+//===---------- MIRVRegNamerUtils.cpp - MIR VReg Renaming Utilities -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MIRVRegNamerUtils.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mir-vregnamer-utils"
+
+namespace {
+
+// TypedVReg and VRType are used to tell the renamer what to do at points in a
+// sequence of values to be renamed. A TypedVReg can contain an actual VReg or
+// a FrameIndex, or it can mark the start of the next candidate (a
+// side-effecting instruction). This tells the renamer to increment to the
+// next vreg name, or to skip modulo some skip-gap value.
+enum VRType { RSE_Reg = 0, RSE_FrameIndex, RSE_NewCandidate };
+class TypedVReg {
+ VRType Type;
+ Register Reg;
+
+public:
+ TypedVReg(Register Reg) : Type(RSE_Reg), Reg(Reg) {}
+ TypedVReg(VRType Type) : Type(Type), Reg(~0U) {
+ assert(Type != RSE_Reg && "Expected a non-Register Type.");
+ }
+
+ bool isReg() const { return Type == RSE_Reg; }
+ bool isFrameIndex() const { return Type == RSE_FrameIndex; }
+ bool isCandidate() const { return Type == RSE_NewCandidate; }
+
+ VRType getType() const { return Type; }
+ Register getReg() const {
+ assert(this->isReg() && "Expected a virtual or physical Register.");
+ return Reg;
+ }
+};
+
+/// Here we find our candidates. What makes an interesting candidate?
+/// A candidate for a canonicalization tree root is normally any kind of
+/// instruction that causes side effects such as a store to memory or a copy to
+/// a physical register or a return instruction. We use these as an expression
+/// tree root that we walk in order to build a canonical walk which should
+/// result in canonical vreg renaming.
+std::vector<MachineInstr *> populateCandidates(MachineBasicBlock *MBB) {
+ std::vector<MachineInstr *> Candidates;
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+ for (auto II = MBB->begin(), IE = MBB->end(); II != IE; ++II) {
+ MachineInstr *MI = &*II;
+
+ bool DoesMISideEffect = false;
+
+ if (MI->getNumOperands() > 0 && MI->getOperand(0).isReg()) {
+ const Register Dst = MI->getOperand(0).getReg();
+ DoesMISideEffect |= !Register::isVirtualRegister(Dst);
+
+ for (auto UI = MRI.use_begin(Dst); UI != MRI.use_end(); ++UI) {
+ if (DoesMISideEffect)
+ break;
+ DoesMISideEffect |= (UI->getParent()->getParent() != MI->getParent());
+ }
+ }
+
+ if (!MI->mayStore() && !MI->isBranch() && !DoesMISideEffect)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Found Candidate: "; MI->dump(););
+ Candidates.push_back(MI);
+ }
+
+ return Candidates;
+}
+
+void doCandidateWalk(std::vector<TypedVReg> &VRegs,
+ std::queue<TypedVReg> &RegQueue,
+ std::vector<MachineInstr *> &VisitedMIs,
+ const MachineBasicBlock *MBB) {
+
+ const MachineFunction &MF = *MBB->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ while (!RegQueue.empty()) {
+
+ auto TReg = RegQueue.front();
+ RegQueue.pop();
+
+ if (TReg.isFrameIndex()) {
+ LLVM_DEBUG(dbgs() << "Popping frame index.\n";);
+ VRegs.push_back(TypedVReg(RSE_FrameIndex));
+ continue;
+ }
+
+ assert(TReg.isReg() && "Expected vreg or physreg.");
+ Register Reg = TReg.getReg();
+
+ if (Register::isVirtualRegister(Reg)) {
+ LLVM_DEBUG({
+ dbgs() << "Popping vreg ";
+ MRI.def_begin(Reg)->dump();
+ dbgs() << "\n";
+ });
+
+ if (!llvm::any_of(VRegs, [&](const TypedVReg &TR) {
+ return TR.isReg() && TR.getReg() == Reg;
+ })) {
+ VRegs.push_back(TypedVReg(Reg));
+ }
+ } else {
+ LLVM_DEBUG(dbgs() << "Popping physreg.\n";);
+ VRegs.push_back(TypedVReg(Reg));
+ continue;
+ }
+
+ for (auto RI = MRI.def_begin(Reg), RE = MRI.def_end(); RI != RE; ++RI) {
+ MachineInstr *Def = RI->getParent();
+
+ if (Def->getParent() != MBB)
+ continue;
+
+ if (llvm::any_of(VisitedMIs,
+ [&](const MachineInstr *VMI) { return Def == VMI; })) {
+ break;
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "\n========================\n";
+ dbgs() << "Visited MI: ";
+ Def->dump();
+ dbgs() << "BB Name: " << Def->getParent()->getName() << "\n";
+ dbgs() << "\n========================\n";
+ });
+ VisitedMIs.push_back(Def);
+ for (unsigned I = 1, E = Def->getNumOperands(); I != E; ++I) {
+
+ MachineOperand &MO = Def->getOperand(I);
+ if (MO.isFI()) {
+ LLVM_DEBUG(dbgs() << "Pushing frame index.\n";);
+ RegQueue.push(TypedVReg(RSE_FrameIndex));
+ }
+
+ if (!MO.isReg())
+ continue;
+ RegQueue.push(TypedVReg(MO.getReg()));
+ }
+ }
+ }
+}
+
+std::map<unsigned, unsigned>
+getVRegRenameMap(const std::vector<TypedVReg> &VRegs,
+ const std::vector<Register> &renamedInOtherBB,
+ MachineRegisterInfo &MRI, NamedVRegCursor &NVC) {
+ std::map<unsigned, unsigned> VRegRenameMap;
+ bool FirstCandidate = true;
+
+ for (auto &vreg : VRegs) {
+ if (vreg.isFrameIndex()) {
+ // We skip one vreg for any frame index because there is a good chance
+ // (especially when comparing SelectionDAG to GlobalISel generated MIR)
+ // that in the other file we are just getting an incoming vreg that comes
+ // from a copy from a frame index. So it's safe to skip by one.
+ unsigned LastRenameReg = NVC.incrementVirtualVReg();
+ (void)LastRenameReg;
+ LLVM_DEBUG(dbgs() << "Skipping rename for FI " << LastRenameReg << "\n";);
+ continue;
+ } else if (vreg.isCandidate()) {
+
+ // After the first candidate, for every subsequent candidate, we skip mod
+ // 10 registers so that the candidates are more likely to start at the
+ // same vreg number, making it more likely that the canonical walks from
+ // matching candidate instructions line up. We don't need to skip from the
+ // first candidate of the BasicBlock because we already skip ahead several
+ // vregs for each BB.
+ unsigned LastRenameReg = NVC.getVirtualVReg();
+ if (FirstCandidate)
+ NVC.incrementVirtualVReg(LastRenameReg % 10);
+ FirstCandidate = false;
+ continue;
+ } else if (!Register::isVirtualRegister(vreg.getReg())) {
+ unsigned LastRenameReg = NVC.incrementVirtualVReg();
+ (void)LastRenameReg;
+ LLVM_DEBUG({
+ dbgs() << "Skipping rename for Phys Reg " << LastRenameReg << "\n";
+ });
+ continue;
+ }
+
+ auto Reg = vreg.getReg();
+ if (llvm::find(renamedInOtherBB, Reg) != renamedInOtherBB.end()) {
+ LLVM_DEBUG(dbgs() << "Vreg " << Reg
+ << " already renamed in other BB.\n";);
+ continue;
+ }
+
+ auto Rename = NVC.createVirtualRegister(Reg);
+
+ if (VRegRenameMap.find(Reg) == VRegRenameMap.end()) {
+ LLVM_DEBUG(dbgs() << "Mapping vreg ";);
+ if (MRI.reg_begin(Reg) != MRI.reg_end()) {
+ LLVM_DEBUG(auto foo = &*MRI.reg_begin(Reg); foo->dump(););
+ } else {
+ LLVM_DEBUG(dbgs() << Reg;);
+ }
+ LLVM_DEBUG(dbgs() << " to ";);
+ if (MRI.reg_begin(Rename) != MRI.reg_end()) {
+ LLVM_DEBUG(auto foo = &*MRI.reg_begin(Rename); foo->dump(););
+ } else {
+ LLVM_DEBUG(dbgs() << Rename;);
+ }
+ LLVM_DEBUG(dbgs() << "\n";);
+
+ VRegRenameMap.insert(std::pair<unsigned, unsigned>(Reg, Rename));
+ }
+ }
+
+ return VRegRenameMap;
+}
+
+bool doVRegRenaming(std::vector<Register> &renamedInOtherBB,
+ const std::map<unsigned, unsigned> &VRegRenameMap,
+ MachineRegisterInfo &MRI) {
+ bool Changed = false;
+ for (auto I = VRegRenameMap.begin(), E = VRegRenameMap.end(); I != E; ++I) {
+
+ auto VReg = I->first;
+ auto Rename = I->second;
+
+ renamedInOtherBB.push_back(Rename);
+
+ std::vector<MachineOperand *> RenameMOs;
+ for (auto &MO : MRI.reg_operands(VReg)) {
+ RenameMOs.push_back(&MO);
+ }
+
+ for (auto *MO : RenameMOs) {
+ Changed = true;
+ MO->setReg(Rename);
+
+ if (!MO->isDef())
+ MO->setIsKill(false);
+ }
+ }
+
+ return Changed;
+}
+
+bool renameVRegs(MachineBasicBlock *MBB,
+ std::vector<Register> &renamedInOtherBB,
+ NamedVRegCursor &NVC) {
+ bool Changed = false;
+ MachineFunction &MF = *MBB->getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ std::vector<MachineInstr *> Candidates = populateCandidates(MBB);
+ std::vector<MachineInstr *> VisitedMIs;
+ llvm::copy(Candidates, std::back_inserter(VisitedMIs));
+
+ std::vector<TypedVReg> VRegs;
+ for (auto candidate : Candidates) {
+ VRegs.push_back(TypedVReg(RSE_NewCandidate));
+
+ std::queue<TypedVReg> RegQueue;
+
+ // Here we enqueue the vreg operands of candidates that are not stores or
+ // branches. The root nodes of the walk are normally the original
+ // candidates (stores); the candidates handled here are the exceptions,
+ // such as copies to physical registers.
+ for (unsigned i = 1; i < candidate->getNumOperands(); i++) {
+ if (candidate->mayStore() || candidate->isBranch())
+ break;
+
+ MachineOperand &MO = candidate->getOperand(i);
+ if (!(MO.isReg() && Register::isVirtualRegister(MO.getReg())))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Enqueue register"; MO.dump(); dbgs() << "\n";);
+ RegQueue.push(TypedVReg(MO.getReg()));
+ }
+
+ // Here we walk the root candidates. We start from the 0th operand because
+ // the root is normally a store to a vreg.
+ for (unsigned i = 0; i < candidate->getNumOperands(); i++) {
+
+ if (!candidate->mayStore() && !candidate->isBranch())
+ break;
+
+ MachineOperand &MO = candidate->getOperand(i);
+
+ // TODO: Do we want to only add vregs here?
+ if (!MO.isReg() && !MO.isFI())
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Enqueue Reg/FI"; MO.dump(); dbgs() << "\n";);
+
+ RegQueue.push(MO.isReg() ? TypedVReg(MO.getReg())
+ : TypedVReg(RSE_FrameIndex));
+ }
+
+ doCandidateWalk(VRegs, RegQueue, VisitedMIs, MBB);
+ }
+
+ // If we have collected no vregs to rename then bail.
+ // The rest of this function does the vreg remapping.
+ if (VRegs.size() == 0)
+ return Changed;
+
+ auto VRegRenameMap = getVRegRenameMap(VRegs, renamedInOtherBB, MRI, NVC);
+ Changed |= doVRegRenaming(renamedInOtherBB, VRegRenameMap, MRI);
+ return Changed;
+}
+} // anonymous namespace
+
+void NamedVRegCursor::skipVRegs() {
+ unsigned VRegGapIndex = 1;
+ if (!virtualVRegNumber) {
+ VRegGapIndex = 0;
+ virtualVRegNumber = MRI.createIncompleteVirtualRegister();
+ }
+ const unsigned VR_GAP = (++VRegGapIndex * SkipGapSize);
+
+ unsigned I = virtualVRegNumber;
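+ // Round up past I to the next multiple of VR_GAP, leaving at least one
+ // full gap so walks from different blocks land on well-separated names.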
+ const unsigned E = (((I + VR_GAP) / VR_GAP) + 1) * VR_GAP;
+
+ virtualVRegNumber = E;
+}
+
+unsigned NamedVRegCursor::createVirtualRegister(unsigned VReg) {
+ if (!virtualVRegNumber)
+ skipVRegs();
+ std::string S;
+ raw_string_ostream OS(S);
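+ // Mask off the high bit that tags virtual registers so the printed name
+ // uses the plain index.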
+ OS << "namedVReg" << (virtualVRegNumber & ~0x80000000);
+ OS.flush();
+ virtualVRegNumber++;
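+ // Preserve the register class when the vreg has one; generic vregs (e.g.
+ // from GlobalISel) carry an LLT type instead, so clone that.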
+ if (auto RC = MRI.getRegClassOrNull(VReg))
+ return MRI.createVirtualRegister(RC, OS.str());
+ return MRI.createGenericVirtualRegister(MRI.getType(VReg), OS.str());
+}
+
+bool NamedVRegCursor::renameVRegs(MachineBasicBlock *MBB) {
+ return ::renameVRegs(MBB, RenamedInOtherBB, *this);
+}
diff --git a/lib/CodeGen/MIRVRegNamerUtils.h b/lib/CodeGen/MIRVRegNamerUtils.h
new file mode 100644
index 000000000000..c5b52a968538
--- /dev/null
+++ b/lib/CodeGen/MIRVRegNamerUtils.h
@@ -0,0 +1,91 @@
+
+//===------------ MIRVRegNamerUtils.h - MIR VReg Renaming Utilities -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// These utilities abstract out the parts of the MIRCanon pass that are
+// responsible for renaming virtual registers, so that the code can be shared
+// with a MIRVRegNamer pass that would be the analog of the opt -instnamer
+// pass.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_MIRVREGNAMERUTILS_H
+#define LLVM_LIB_CODEGEN_MIRVREGNAMERUTILS_H
+
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <queue>
+
+namespace llvm {
+
+/// NamedVRegCursor - The cursor is an object that keeps track of what the next
+/// vreg name should be. It does the bookkeeping to determine when to skip the
+/// index value and by how much, or whether the next vreg name should simply be
+/// an increment of the previous one.
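+///
+/// A minimal usage sketch, one cursor per MachineFunction (the Changed flag
+/// here is illustrative, not part of the API):
+/// \code
+///   NamedVRegCursor NVC(MF.getRegInfo());
+///   for (MachineBasicBlock &MBB : MF)
+///     Changed |= NVC.renameVRegs(&MBB);
+/// \endcode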
+class NamedVRegCursor {
+ MachineRegisterInfo &MRI;
+
+ /// virtualVRegNumber - Bookkeeping of the last vreg position.
+ unsigned virtualVRegNumber;
+
+ /// SkipGapSize - Used to calculate a modulo amount to skip by after every
+ /// sequence of instructions starting from a given side-effecting
+ /// MachineInstruction for a given MachineBasicBlock. The general idea is
+ /// that for a given program compiled with two different opt pipelines,
+ /// there shouldn't be more than a SkipGapSize difference in how many vregs
+ /// are in play between the two. By rounding up to the next multiple of
+ /// SkipGapSize after every def-use graph of vregs we rename, we have a high
+ /// chance of landing on the same name for two matching side-effects across
+ /// the two compilation outcomes.
+ const unsigned SkipGapSize;
+
+ /// RenamedInOtherBB - VRegs that we have already renamed, i.e. breadcrumbs.
+ std::vector<Register> RenamedInOtherBB;
+
+public:
+ NamedVRegCursor() = delete;
+ /// A SkipGapSize of 1000 was a good heuristic when the MIRCanonicalizerPass
+ /// was written. Adjust as needed.
+ NamedVRegCursor(MachineRegisterInfo &MRI, unsigned SkipGapSize = 1000)
+ : MRI(MRI), virtualVRegNumber(0), SkipGapSize(SkipGapSize) {}
+
+ /// skipVRegs - Skips the index ahead modulo a gap value. Indices are used
+ /// to produce the next vreg name.
+ void skipVRegs();
+
+ unsigned getVirtualVReg() const { return virtualVRegNumber; }
+
+ /// incrementVirtualVReg - This increments an index value that is used to
+ /// create a new vreg name. This is not a Register.
+ unsigned incrementVirtualVReg(unsigned incr = 1) {
+ virtualVRegNumber += incr;
+ return virtualVRegNumber;
+ }
+
+ /// createVirtualRegister - Given an existing vreg, create a named vreg to
+ /// take its place.
+ unsigned createVirtualRegister(unsigned VReg);
+
+ /// renameVRegs - For a given MachineBasicBlock, scan for side-effecting
+ /// instructions, walk the def-use graph from each side-effecting root (in
+ /// sorted root order) and rename the encountered vregs in the def-use graph
+ /// in a canonical ordering. This method maintains the bookkeeping for which
+ /// vregs were already renamed, in RenamedInOtherBB.
+ /// \returns true if any vregs were renamed.
+ bool renameVRegs(MachineBasicBlock *MBB);
+};
+
+} // namespace llvm
+
+#endif
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index 4d29e883d879..854bef3aab05 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -39,6 +39,12 @@ using namespace llvm;
#define DEBUG_TYPE "codegen"
+static cl::opt<bool> PrintSlotIndexes(
+ "print-slotindexes",
+ cl::desc("When printing machine IR, annotate instructions and blocks with "
+ "SlotIndexes when available"),
+ cl::init(true), cl::Hidden);
+
MachineBasicBlock::MachineBasicBlock(MachineFunction &MF, const BasicBlock *B)
: BB(B), Number(-1), xParent(&MF) {
Insts.Parent = this;
@@ -291,7 +297,7 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST,
return;
}
- if (Indexes)
+ if (Indexes && PrintSlotIndexes)
OS << Indexes->getMBBStartIdx(this) << '\t';
OS << "bb." << getNumber();
@@ -320,9 +326,9 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST,
OS << "landing-pad";
HasAttributes = true;
}
- if (getAlignment()) {
+ if (getAlignment() != Align::None()) {
OS << (HasAttributes ? ", " : " (");
- OS << "align " << getAlignment();
+ OS << "align " << Log2(getAlignment());
HasAttributes = true;
}
if (HasAttributes)
@@ -402,7 +408,7 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST,
bool IsInBundle = false;
for (const MachineInstr &MI : instrs()) {
- if (Indexes) {
+ if (Indexes && PrintSlotIndexes) {
if (Indexes->hasIndex(MI))
OS << Indexes->getInstructionIndex(MI);
OS << '\t';
@@ -484,9 +490,9 @@ void MachineBasicBlock::sortUniqueLiveIns() {
}
unsigned
-MachineBasicBlock::addLiveIn(MCPhysReg PhysReg, const TargetRegisterClass *RC) {
+MachineBasicBlock::addLiveIn(MCRegister PhysReg, const TargetRegisterClass *RC) {
assert(getParent() && "MBB must be inserted in function");
- assert(TargetRegisterInfo::isPhysicalRegister(PhysReg) && "Expected physreg");
+ assert(PhysReg.isPhysical() && "Expected physreg");
assert(RC && "Register class is required");
assert((isEHPad() || this == &getParent()->front()) &&
"Only the entry block and landing pads can have physreg live ins");
@@ -500,14 +506,14 @@ MachineBasicBlock::addLiveIn(MCPhysReg PhysReg, const TargetRegisterClass *RC) {
if (LiveIn)
for (;I != E && I->isCopy(); ++I)
if (I->getOperand(1).getReg() == PhysReg) {
- unsigned VirtReg = I->getOperand(0).getReg();
+ Register VirtReg = I->getOperand(0).getReg();
if (!MRI.constrainRegClass(VirtReg, RC))
llvm_unreachable("Incompatible live-in register class.");
return VirtReg;
}
// No luck, create a virtual register.
- unsigned VirtReg = MRI.createVirtualRegister(RC);
+ Register VirtReg = MRI.createVirtualRegister(RC);
BuildMI(*this, I, DebugLoc(), TII.get(TargetOpcode::COPY), VirtReg)
.addReg(PhysReg, RegState::Kill);
if (!LiveIn)
@@ -772,7 +778,8 @@ void MachineBasicBlock::transferSuccessors(MachineBasicBlock *FromMBB) {
while (!FromMBB->succ_empty()) {
MachineBasicBlock *Succ = *FromMBB->succ_begin();
- // If probability list is empty it means we don't use it (disabled optimization).
+ // If probability list is empty it means we don't use it (disabled
+ // optimization).
if (!FromMBB->Probs.empty()) {
auto Prob = *FromMBB->Probs.begin();
addSuccessor(Succ, Prob);
@@ -798,13 +805,7 @@ MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB) {
FromMBB->removeSuccessor(Succ);
// Fix up any PHI nodes in the successor.
- for (MachineBasicBlock::instr_iterator MI = Succ->instr_begin(),
- ME = Succ->instr_end(); MI != ME && MI->isPHI(); ++MI)
- for (unsigned i = 2, e = MI->getNumOperands()+1; i != e; i += 2) {
- MachineOperand &MO = MI->getOperand(i);
- if (MO.getMBB() == FromMBB)
- MO.setMBB(this);
- }
+ Succ->replacePhiUsesWith(FromMBB, this);
}
normalizeSuccProbs();
}
@@ -907,8 +908,8 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ,
if (!OI->isReg() || OI->getReg() == 0 ||
!OI->isUse() || !OI->isKill() || OI->isUndef())
continue;
- unsigned Reg = OI->getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
+ Register Reg = OI->getReg();
+ if (Register::isPhysicalRegister(Reg) ||
LV->getVarInfo(Reg).removeKill(*MI)) {
KilledRegs.push_back(Reg);
LLVM_DEBUG(dbgs() << "Removing terminator kill: " << *MI);
@@ -928,7 +929,7 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ,
if (!OI->isReg() || OI->getReg() == 0)
continue;
- unsigned Reg = OI->getReg();
+ Register Reg = OI->getReg();
if (!is_contained(UsedRegs, Reg))
UsedRegs.push_back(Reg);
}
@@ -979,13 +980,8 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ,
}
}
- // Fix PHI nodes in Succ so they refer to NMBB instead of this
- for (MachineBasicBlock::instr_iterator
- i = Succ->instr_begin(),e = Succ->instr_end();
- i != e && i->isPHI(); ++i)
- for (unsigned ni = 1, ne = i->getNumOperands(); ni != ne; ni += 2)
- if (i->getOperand(ni+1).getMBB() == this)
- i->getOperand(ni+1).setMBB(NMBB);
+ // Fix PHI nodes in Succ so they refer to NMBB instead of this.
+ Succ->replacePhiUsesWith(this, NMBB);
// Inherit live-ins from the successor
for (const auto &LI : Succ->liveins())
@@ -1000,7 +996,7 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ,
for (instr_iterator I = instr_end(), E = instr_begin(); I != E;) {
if (!(--I)->addRegisterKilled(Reg, TRI, /* AddIfNotFound= */ false))
continue;
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
LV->getVarInfo(Reg).Kills.push_back(&*I);
LLVM_DEBUG(dbgs() << "Restored terminator kill: " << *I);
break;
@@ -1033,7 +1029,7 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ,
for (unsigned ni = 1, ne = I->getNumOperands(); ni != ne; ni += 2) {
if (I->getOperand(ni+1).getMBB() == NMBB) {
MachineOperand &MO = I->getOperand(ni);
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
PHISrcRegs.insert(Reg);
if (MO.isUndef())
continue;
@@ -1049,7 +1045,7 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ,
MachineRegisterInfo *MRI = &getParent()->getRegInfo();
for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ unsigned Reg = Register::index2VirtReg(i);
if (PHISrcRegs.count(Reg) || !LIS->hasInterval(Reg))
continue;
@@ -1217,6 +1213,16 @@ void MachineBasicBlock::ReplaceUsesOfBlockWith(MachineBasicBlock *Old,
replaceSuccessor(Old, New);
}
+void MachineBasicBlock::replacePhiUsesWith(MachineBasicBlock *Old,
+ MachineBasicBlock *New) {
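+ // PHI operands come in (value, predecessor) pairs after the def, so the
+ // MBB operands sit at every other index starting from 2.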
+ for (MachineInstr &MI : phis())
+ for (unsigned i = 2, e = MI.getNumOperands() + 1; i != e; i += 2) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (MO.getMBB() == Old)
+ MO.setMBB(New);
+ }
+}
+
/// Various pieces of code can cause excess edges in the CFG to be inserted. If
/// we have proven that MBB can only branch to DestA and DestB, remove any other
/// MBB successors from the CFG. DestA and DestB can be null.
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index 639b588766a1..ac19bc0bd8ea 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -79,16 +79,17 @@ STATISTIC(CondBranchTakenFreq,
STATISTIC(UncondBranchTakenFreq,
"Potential frequency of taking unconditional branches");
-static cl::opt<unsigned> AlignAllBlock("align-all-blocks",
- cl::desc("Force the alignment of all "
- "blocks in the function."),
- cl::init(0), cl::Hidden);
+static cl::opt<unsigned> AlignAllBlock(
+ "align-all-blocks",
+ cl::desc("Force the alignment of all blocks in the function in log2 format "
+ "(e.g 4 means align on 16B boundaries)."),
+ cl::init(0), cl::Hidden);
static cl::opt<unsigned> AlignAllNonFallThruBlocks(
"align-all-nofallthru-blocks",
- cl::desc("Force the alignment of all "
- "blocks that have no fall-through predecessors (i.e. don't add "
- "nops that are executed)."),
+ cl::desc("Force the alignment of all blocks that have no fall-through "
+ "predecessors (i.e. don't add nops that are executed). In log2 "
+ "format (e.g 4 means align on 16B boundaries)."),
cl::init(0), cl::Hidden);
// FIXME: Find a good default for this flag and remove the flag.
@@ -2763,8 +2764,8 @@ void MachineBlockPlacement::alignBlocks() {
if (!L)
continue;
- unsigned Align = TLI->getPrefLoopAlignment(L);
- if (!Align)
+ const Align Align = TLI->getPrefLoopAlignment(L);
+ if (Align == 1)
continue; // Don't care about loop alignment.
// If the block is cold relative to the function entry don't waste space
@@ -2981,7 +2982,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
F = &MF;
MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
- MBFI = llvm::make_unique<BranchFolder::MBFIWrapper>(
+ MBFI = std::make_unique<BranchFolder::MBFIWrapper>(
getAnalysis<MachineBlockFrequencyInfo>());
MLI = &getAnalysis<MachineLoopInfo>();
TII = MF.getSubtarget().getInstrInfo();
@@ -3038,8 +3039,9 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
*MBPI, TailMergeSize);
+ auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>();
if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(),
- getAnalysisIfAvailable<MachineModuleInfo>(), MLI,
+ MMIWP ? &MMIWP->getMMI() : nullptr, MLI,
/*AfterPlacement=*/true)) {
// Redo the layout if tail merging creates/removes/moves blocks.
BlockToChain.clear();
@@ -3062,14 +3064,14 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
if (AlignAllBlock)
// Align all of the blocks in the function to a specific alignment.
for (MachineBasicBlock &MBB : MF)
- MBB.setAlignment(AlignAllBlock);
+ MBB.setAlignment(Align(1ULL << AlignAllBlock));
else if (AlignAllNonFallThruBlocks) {
// Align all of the blocks that have no fall-through predecessors to a
// specific alignment.
for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
auto LayoutPred = std::prev(MBI);
if (!LayoutPred->isSuccessor(&*MBI))
- MBI->setAlignment(AlignAllNonFallThruBlocks);
+ MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks));
}
}
if (ViewBlockLayoutWithBFI != GVDT_None &&
diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp
index 2df6d40d9293..d9bd32b2fbab 100644
--- a/lib/CodeGen/MachineCSE.cpp
+++ b/lib/CodeGen/MachineCSE.cpp
@@ -21,6 +21,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -66,6 +67,7 @@ namespace {
AliasAnalysis *AA;
MachineDominatorTree *DT;
MachineRegisterInfo *MRI;
+ MachineBlockFrequencyInfo *MBFI;
public:
static char ID; // Pass identification
@@ -83,6 +85,8 @@ namespace {
AU.addPreservedID(MachineLoopInfoID);
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addPreserved<MachineBlockFrequencyInfo>();
}
void releaseMemory() override {
@@ -133,6 +137,11 @@ namespace {
bool isPRECandidate(MachineInstr *MI);
bool ProcessBlockPRE(MachineDominatorTree *MDT, MachineBasicBlock *MBB);
bool PerformSimplePRE(MachineDominatorTree *DT);
+ /// Heuristics to see if it's profitable to move common computations of MBB
+ /// and MBB1 to CandidateBB.
+ bool isProfitableToHoistInto(MachineBasicBlock *CandidateBB,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock *MBB1);
};
} // end anonymous namespace
@@ -158,15 +167,15 @@ bool MachineCSE::PerformTrivialCopyPropagation(MachineInstr *MI,
for (MachineOperand &MO : MI->operands()) {
if (!MO.isReg() || !MO.isUse())
continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ Register Reg = MO.getReg();
+ if (!Register::isVirtualRegister(Reg))
continue;
bool OnlyOneUse = MRI->hasOneNonDBGUse(Reg);
MachineInstr *DefMI = MRI->getVRegDef(Reg);
if (!DefMI->isCopy())
continue;
- unsigned SrcReg = DefMI->getOperand(1).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ Register SrcReg = DefMI->getOperand(1).getReg();
+ if (!Register::isVirtualRegister(SrcReg))
continue;
if (DefMI->getOperand(0).getSubReg())
continue;
@@ -189,14 +198,16 @@ bool MachineCSE::PerformTrivialCopyPropagation(MachineInstr *MI,
LLVM_DEBUG(dbgs() << "Coalescing: " << *DefMI);
LLVM_DEBUG(dbgs() << "*** to: " << *MI);
- // Update matching debug values.
- DefMI->changeDebugValuesDefReg(SrcReg);
-
// Propagate SrcReg of copies to MI.
MO.setReg(SrcReg);
MRI->clearKillFlags(SrcReg);
// Coalesce single use copies.
if (OnlyOneUse) {
+ // If (and only if) we've eliminated all uses of the copy, also
+ // copy-propagate to any debug-users of MI, or they'll be left using
+ // an undefined value.
+ DefMI->changeDebugValuesDefReg(SrcReg);
+
DefMI->eraseFromParent();
++NumCoalesces;
}
@@ -271,10 +282,10 @@ bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI,
for (const MachineOperand &MO : MI->operands()) {
if (!MO.isReg() || MO.isDef())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
continue;
// Reading either caller preserved or constant physregs is ok.
if (!isCallerPreservedOrConstPhysReg(Reg, *MI->getMF(), *TRI))
@@ -290,10 +301,10 @@ bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI,
const MachineOperand &MO = MOP.value();
if (!MO.isReg() || !MO.isDef())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
continue;
// Check against PhysRefs even if the def is "dead".
if (PhysRefs.count(Reg))
@@ -367,8 +378,8 @@ bool MachineCSE::PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI,
return false;
if (!MO.isReg() || !MO.isDef())
continue;
- unsigned MOReg = MO.getReg();
- if (TargetRegisterInfo::isVirtualRegister(MOReg))
+ Register MOReg = MO.getReg();
+ if (Register::isVirtualRegister(MOReg))
continue;
if (PhysRefs.count(MOReg))
return false;
@@ -424,8 +435,7 @@ bool MachineCSE::isProfitableToCSE(unsigned CSReg, unsigned Reg,
// If CSReg is used at all uses of Reg, CSE should not increase register
// pressure of CSReg.
bool MayIncreasePressure = true;
- if (TargetRegisterInfo::isVirtualRegister(CSReg) &&
- TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(CSReg) && Register::isVirtualRegister(Reg)) {
MayIncreasePressure = false;
SmallPtrSet<MachineInstr*, 8> CSUses;
for (MachineInstr &MI : MRI->use_nodbg_instructions(CSReg)) {
@@ -453,8 +463,7 @@ bool MachineCSE::isProfitableToCSE(unsigned CSReg, unsigned Reg,
// of the redundant computation are copies, do not cse.
bool HasVRegUse = false;
for (const MachineOperand &MO : MI->operands()) {
- if (MO.isReg() && MO.isUse() &&
- TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+ if (MO.isReg() && MO.isUse() && Register::isVirtualRegister(MO.getReg())) {
HasVRegUse = true;
break;
}
@@ -586,8 +595,8 @@ bool MachineCSE::ProcessBlockCSE(MachineBasicBlock *MBB) {
MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg() || !MO.isDef())
continue;
- unsigned OldReg = MO.getReg();
- unsigned NewReg = CSMI->getOperand(i).getReg();
+ Register OldReg = MO.getReg();
+ Register NewReg = CSMI->getOperand(i).getReg();
// Go through implicit defs of CSMI and MI, if a def is not dead at MI,
// we should make sure it is not dead at CSMI.
@@ -604,8 +613,8 @@ bool MachineCSE::ProcessBlockCSE(MachineBasicBlock *MBB) {
continue;
}
- assert(TargetRegisterInfo::isVirtualRegister(OldReg) &&
- TargetRegisterInfo::isVirtualRegister(NewReg) &&
+ assert(Register::isVirtualRegister(OldReg) &&
+ Register::isVirtualRegister(NewReg) &&
"Do not CSE physical register defs!");
if (!isProfitableToCSE(NewReg, OldReg, CSMI->getParent(), MI)) {
@@ -769,11 +778,11 @@ bool MachineCSE::isPRECandidate(MachineInstr *MI) {
return false;
for (auto def : MI->defs())
- if (!TRI->isVirtualRegister(def.getReg()))
+ if (!Register::isVirtualRegister(def.getReg()))
return false;
for (auto use : MI->uses())
- if (use.isReg() && !TRI->isVirtualRegister(use.getReg()))
+ if (use.isReg() && !Register::isVirtualRegister(use.getReg()))
return false;
return true;
@@ -802,6 +811,9 @@ bool MachineCSE::ProcessBlockPRE(MachineDominatorTree *DT,
if (!CMBB->isLegalToHoistInto())
continue;
+ if (!isProfitableToHoistInto(CMBB, MBB, MBB1))
+ continue;
+
// Two instrs are partial redundant if their basic blocks are reachable
// from one to another but one doesn't dominate another.
if (CMBB != MBB1) {
@@ -812,8 +824,8 @@ bool MachineCSE::ProcessBlockPRE(MachineDominatorTree *DT,
assert(MI->getOperand(0).isDef() &&
"First operand of instr with one explicit def must be this def");
- unsigned VReg = MI->getOperand(0).getReg();
- unsigned NewReg = MRI->cloneVirtualRegister(VReg);
+ Register VReg = MI->getOperand(0).getReg();
+ Register NewReg = MRI->cloneVirtualRegister(VReg);
if (!isProfitableToCSE(NewReg, VReg, CMBB, MI))
continue;
MachineInstr &NewMI =
@@ -854,6 +866,18 @@ bool MachineCSE::PerformSimplePRE(MachineDominatorTree *DT) {
return Changed;
}
+bool MachineCSE::isProfitableToHoistInto(MachineBasicBlock *CandidateBB,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock *MBB1) {
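+ // When optimizing for minimum size, hoisting always pays: it replaces two
+ // copies of the computation with one.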
+ if (CandidateBB->getParent()->getFunction().hasMinSize())
+ return true;
+ assert(DT->dominates(CandidateBB, MBB) && "CandidateBB should dominate MBB");
+ assert(DT->dominates(CandidateBB, MBB1) &&
+ "CandidateBB should dominate MBB1");
+ return MBFI->getBlockFreq(CandidateBB) <=
+ MBFI->getBlockFreq(MBB) + MBFI->getBlockFreq(MBB1);
+}
+
bool MachineCSE::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -863,6 +887,7 @@ bool MachineCSE::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
DT = &getAnalysis<MachineDominatorTree>();
+ MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
LookAheadLimit = TII->getMachineCSELookAheadLimit();
bool ChangedPRE, ChangedCSE;
ChangedPRE = PerformSimplePRE(DT);
diff --git a/lib/CodeGen/MachineCombiner.cpp b/lib/CodeGen/MachineCombiner.cpp
index 0584ec0bd2b3..e9f462fd1b37 100644
--- a/lib/CodeGen/MachineCombiner.cpp
+++ b/lib/CodeGen/MachineCombiner.cpp
@@ -137,7 +137,7 @@ void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
MachineInstr *DefInstr = nullptr;
// We need a virtual register definition.
- if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ if (MO.isReg() && Register::isVirtualRegister(MO.getReg()))
DefInstr = MRI->getUniqueVRegDef(MO.getReg());
// PHI's have no depth etc.
if (DefInstr && DefInstr->isPHI())
@@ -168,7 +168,7 @@ MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
unsigned IDepth = 0;
for (const MachineOperand &MO : InstrPtr->operands()) {
// Check for virtual register operand.
- if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
+ if (!(MO.isReg() && Register::isVirtualRegister(MO.getReg())))
continue;
if (!MO.isUse())
continue;
@@ -223,7 +223,7 @@ unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
for (const MachineOperand &MO : NewRoot->operands()) {
// Check for virtual register operand.
- if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
+ if (!(MO.isReg() && Register::isVirtualRegister(MO.getReg())))
continue;
if (!MO.isDef())
continue;
diff --git a/lib/CodeGen/MachineCopyPropagation.cpp b/lib/CodeGen/MachineCopyPropagation.cpp
index 9fc12ac89e12..ebe76e31dca9 100644
--- a/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/lib/CodeGen/MachineCopyPropagation.cpp
@@ -119,8 +119,8 @@ public:
void trackCopy(MachineInstr *MI, const TargetRegisterInfo &TRI) {
assert(MI->isCopy() && "Tracking non-copy?");
- unsigned Def = MI->getOperand(0).getReg();
- unsigned Src = MI->getOperand(1).getReg();
+ Register Def = MI->getOperand(0).getReg();
+ Register Src = MI->getOperand(1).getReg();
// Remember Def is defined by the copy.
for (MCRegUnitIterator RUI(Def, &TRI); RUI.isValid(); ++RUI)
@@ -163,8 +163,8 @@ public:
// Check that the available copy isn't clobbered by any regmasks between
// itself and the destination.
- unsigned AvailSrc = AvailCopy->getOperand(1).getReg();
- unsigned AvailDef = AvailCopy->getOperand(0).getReg();
+ Register AvailSrc = AvailCopy->getOperand(1).getReg();
+ Register AvailDef = AvailCopy->getOperand(0).getReg();
for (const MachineInstr &MI :
make_range(AvailCopy->getIterator(), DestCopy.getIterator()))
for (const MachineOperand &MO : MI.operands())
@@ -205,8 +205,11 @@ public:
}
private:
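+ // Distinguishes reads from debug instructions (e.g. DBG_VALUE), which must
+ // not keep an otherwise-dead copy alive, from reads in ordinary code.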
+ typedef enum { DebugUse = false, RegularUse = true } DebugType;
+
void ClobberRegister(unsigned Reg);
- void ReadRegister(unsigned Reg);
+ void ReadRegister(unsigned Reg, MachineInstr &Reader,
+ DebugType DT);
void CopyPropagateBlock(MachineBasicBlock &MBB);
bool eraseIfRedundant(MachineInstr &Copy, unsigned Src, unsigned Def);
void forwardUses(MachineInstr &MI);
@@ -217,6 +220,9 @@ private:
/// Candidates for deletion.
SmallSetVector<MachineInstr *, 8> MaybeDeadCopies;
+ /// Multimap tracking debug users in current BB
+ DenseMap<MachineInstr*, SmallVector<MachineInstr*, 2>> CopyDbgUsers;
+
CopyTracker Tracker;
bool Changed;
@@ -231,13 +237,19 @@ char &llvm::MachineCopyPropagationID = MachineCopyPropagation::ID;
INITIALIZE_PASS(MachineCopyPropagation, DEBUG_TYPE,
"Machine Copy Propagation Pass", false, false)
-void MachineCopyPropagation::ReadRegister(unsigned Reg) {
+void MachineCopyPropagation::ReadRegister(unsigned Reg, MachineInstr &Reader,
+ DebugType DT) {
// If 'Reg' is defined by a copy, the copy is no longer a candidate
- // for elimination.
+ // for elimination. If a copy is "read" by a debug user, record the user
+ // for propagation.
for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
if (MachineInstr *Copy = Tracker.findCopyForUnit(*RUI, *TRI)) {
- LLVM_DEBUG(dbgs() << "MCP: Copy is used - not dead: "; Copy->dump());
- MaybeDeadCopies.remove(Copy);
+ if (DT == RegularUse) {
+ LLVM_DEBUG(dbgs() << "MCP: Copy is used - not dead: "; Copy->dump());
+ MaybeDeadCopies.remove(Copy);
+ } else {
+ CopyDbgUsers[Copy].push_back(&Reader);
+ }
}
}
}
@@ -250,8 +262,8 @@ void MachineCopyPropagation::ReadRegister(unsigned Reg) {
/// isNopCopy("ecx = COPY eax", AH, CL) == false
static bool isNopCopy(const MachineInstr &PreviousCopy, unsigned Src,
unsigned Def, const TargetRegisterInfo *TRI) {
- unsigned PreviousSrc = PreviousCopy.getOperand(1).getReg();
- unsigned PreviousDef = PreviousCopy.getOperand(0).getReg();
+ Register PreviousSrc = PreviousCopy.getOperand(1).getReg();
+ Register PreviousDef = PreviousCopy.getOperand(0).getReg();
if (Src == PreviousSrc) {
assert(Def == PreviousDef);
return true;
@@ -288,7 +300,7 @@ bool MachineCopyPropagation::eraseIfRedundant(MachineInstr &Copy, unsigned Src,
// Copy was redundantly redefining either Src or Def. Remove earlier kill
// flags between Copy and PrevCopy because the value will be reused now.
assert(Copy.isCopy());
- unsigned CopyDef = Copy.getOperand(0).getReg();
+ Register CopyDef = Copy.getOperand(0).getReg();
assert(CopyDef == Src || CopyDef == Def);
for (MachineInstr &MI :
make_range(PrevCopy->getIterator(), Copy.getIterator()))
@@ -307,7 +319,7 @@ bool MachineCopyPropagation::isForwardableRegClassCopy(const MachineInstr &Copy,
const MachineInstr &UseI,
unsigned UseIdx) {
- unsigned CopySrcReg = Copy.getOperand(1).getReg();
+ Register CopySrcReg = Copy.getOperand(1).getReg();
// If the new register meets the opcode register constraints, then allow
// forwarding.
@@ -398,9 +410,9 @@ void MachineCopyPropagation::forwardUses(MachineInstr &MI) {
if (!Copy)
continue;
- unsigned CopyDstReg = Copy->getOperand(0).getReg();
+ Register CopyDstReg = Copy->getOperand(0).getReg();
const MachineOperand &CopySrc = Copy->getOperand(1);
- unsigned CopySrcReg = CopySrc.getReg();
+ Register CopySrcReg = CopySrc.getReg();
// FIXME: Don't handle partial uses of wider COPYs yet.
if (MOUse.getReg() != CopyDstReg) {
@@ -456,11 +468,11 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
// Analyze copies (which don't overlap themselves).
if (MI->isCopy() && !TRI->regsOverlap(MI->getOperand(0).getReg(),
MI->getOperand(1).getReg())) {
- unsigned Def = MI->getOperand(0).getReg();
- unsigned Src = MI->getOperand(1).getReg();
+ Register Def = MI->getOperand(0).getReg();
+ Register Src = MI->getOperand(1).getReg();
- assert(!TargetRegisterInfo::isVirtualRegister(Def) &&
- !TargetRegisterInfo::isVirtualRegister(Src) &&
+ assert(!Register::isVirtualRegister(Def) &&
+ !Register::isVirtualRegister(Src) &&
"MachineCopyPropagation should be run after register allocation!");
// The two copies cancel out and the source of the first copy
@@ -488,14 +500,14 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
// If Src is defined by a previous copy, the previous copy cannot be
// eliminated.
- ReadRegister(Src);
+ ReadRegister(Src, *MI, RegularUse);
for (const MachineOperand &MO : MI->implicit_operands()) {
if (!MO.isReg() || !MO.readsReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
- ReadRegister(Reg);
+ ReadRegister(Reg, *MI, RegularUse);
}
LLVM_DEBUG(dbgs() << "MCP: Copy is a deletion candidate: "; MI->dump());
@@ -515,7 +527,7 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
for (const MachineOperand &MO : MI->implicit_operands()) {
if (!MO.isReg() || !MO.isDef())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
Tracker.clobberRegister(Reg, *TRI);
@@ -529,12 +541,12 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
// Clobber any earlyclobber regs first.
for (const MachineOperand &MO : MI->operands())
if (MO.isReg() && MO.isEarlyClobber()) {
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
// If we have a tied earlyclobber, that means it is also read by this
// instruction, so we need to make sure we don't remove it as dead
// later.
if (MO.isTied())
- ReadRegister(Reg);
+ ReadRegister(Reg, *MI, RegularUse);
Tracker.clobberRegister(Reg, *TRI);
}
@@ -548,18 +560,18 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
RegMask = &MO;
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
- assert(!TargetRegisterInfo::isVirtualRegister(Reg) &&
+ assert(!Register::isVirtualRegister(Reg) &&
"MachineCopyPropagation should be run after register allocation!");
if (MO.isDef() && !MO.isEarlyClobber()) {
Defs.push_back(Reg);
continue;
- } else if (!MO.isDebug() && MO.readsReg())
- ReadRegister(Reg);
+ } else if (MO.readsReg())
+ ReadRegister(Reg, *MI, MO.isDebug() ? DebugUse : RegularUse);
}
// The instruction has a register mask operand which means that it clobbers
@@ -571,7 +583,7 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
MaybeDeadCopies.begin();
DI != MaybeDeadCopies.end();) {
MachineInstr *MaybeDead = *DI;
- unsigned Reg = MaybeDead->getOperand(0).getReg();
+ Register Reg = MaybeDead->getOperand(0).getReg();
assert(!MRI->isReserved(Reg));
if (!RegMask->clobbersPhysReg(Reg)) {
@@ -609,9 +621,10 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
MaybeDead->dump());
assert(!MRI->isReserved(MaybeDead->getOperand(0).getReg()));
- // Update matching debug values.
+ // Update matching debug values, if any.
assert(MaybeDead->isCopy());
- MaybeDead->changeDebugValuesDefReg(MaybeDead->getOperand(1).getReg());
+ unsigned SrcReg = MaybeDead->getOperand(1).getReg();
+ MRI->updateDbgUsersToReg(SrcReg, CopyDbgUsers[MaybeDead]);
MaybeDead->eraseFromParent();
Changed = true;
@@ -620,6 +633,7 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
}
MaybeDeadCopies.clear();
+ CopyDbgUsers.clear();
Tracker.clear();
}
diff --git a/lib/CodeGen/MachineDominators.cpp b/lib/CodeGen/MachineDominators.cpp
index 1dfba8638c22..706c706d7527 100644
--- a/lib/CodeGen/MachineDominators.cpp
+++ b/lib/CodeGen/MachineDominators.cpp
@@ -18,12 +18,15 @@
using namespace llvm;
+namespace llvm {
// Always verify dominfo if expensive checking is enabled.
#ifdef EXPENSIVE_CHECKS
-static bool VerifyMachineDomInfo = true;
+bool VerifyMachineDomInfo = true;
#else
-static bool VerifyMachineDomInfo = false;
+bool VerifyMachineDomInfo = false;
#endif
+} // namespace llvm
+
static cl::opt<bool, true> VerifyMachineDomInfoX(
"verify-machine-dom-info", cl::location(VerifyMachineDomInfo), cl::Hidden,
cl::desc("Verify machine dominator info (time consuming)"));
@@ -64,21 +67,11 @@ void MachineDominatorTree::releaseMemory() {
}
void MachineDominatorTree::verifyAnalysis() const {
- if (DT && VerifyMachineDomInfo) {
- MachineFunction &F = *getRoot()->getParent();
-
- DomTreeBase<MachineBasicBlock> OtherDT;
- OtherDT.recalculate(F);
- if (getRootNode()->getBlock() != OtherDT.getRootNode()->getBlock() ||
- DT->compare(OtherDT)) {
- errs() << "MachineDominatorTree for function " << F.getName()
- << " is not up to date!\nComputed:\n";
- DT->print(errs());
- errs() << "\nActual:\n";
- OtherDT.print(errs());
+ if (DT && VerifyMachineDomInfo)
+ if (!DT->verify(DomTreeT::VerificationLevel::Basic)) {
+ errs() << "MachineDominatorTree verification failed\n";
abort();
}
- }
}
void MachineDominatorTree::print(raw_ostream &OS, const Module*) const {
diff --git a/lib/CodeGen/MachineFrameInfo.cpp b/lib/CodeGen/MachineFrameInfo.cpp
index bae3a4333bda..604f5145b1a0 100644
--- a/lib/CodeGen/MachineFrameInfo.cpp
+++ b/lib/CodeGen/MachineFrameInfo.cpp
@@ -28,25 +28,26 @@
using namespace llvm;
-void MachineFrameInfo::ensureMaxAlignment(unsigned Align) {
+void MachineFrameInfo::ensureMaxAlignment(Align Alignment) {
if (!StackRealignable)
- assert(Align <= StackAlignment &&
- "For targets without stack realignment, Align is out of limit!");
- if (MaxAlignment < Align) MaxAlignment = Align;
+ assert(Alignment <= StackAlignment &&
+ "For targets without stack realignment, Alignment is out of limit!");
+ if (MaxAlignment < Alignment)
+ MaxAlignment = Alignment;
}
/// Clamp the alignment if requested and emit a warning.
-static inline unsigned clampStackAlignment(bool ShouldClamp, unsigned Align,
- unsigned StackAlign) {
- if (!ShouldClamp || Align <= StackAlign)
- return Align;
- LLVM_DEBUG(dbgs() << "Warning: requested alignment " << Align
- << " exceeds the stack alignment " << StackAlign
+static inline Align clampStackAlignment(bool ShouldClamp, Align Alignment,
+ Align StackAlignment) {
+ if (!ShouldClamp || Alignment <= StackAlignment)
+ return Alignment;
+ LLVM_DEBUG(dbgs() << "Warning: requested alignment " << Alignment.value()
+ << " exceeds the stack alignment " << StackAlignment.value()
<< " when stack realignment is off" << '\n');
- return StackAlign;
+ return StackAlignment;
}
-int MachineFrameInfo::CreateStackObject(uint64_t Size, unsigned Alignment,
+int MachineFrameInfo::CreateStackObject(uint64_t Size, Align Alignment,
bool IsSpillSlot,
const AllocaInst *Alloca,
uint8_t StackID) {
@@ -61,8 +62,7 @@ int MachineFrameInfo::CreateStackObject(uint64_t Size, unsigned Alignment,
return Index;
}
-int MachineFrameInfo::CreateSpillStackObject(uint64_t Size,
- unsigned Alignment) {
+int MachineFrameInfo::CreateSpillStackObject(uint64_t Size, Align Alignment) {
Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
CreateStackObject(Size, Alignment, true);
int Index = (int)Objects.size() - NumFixedObjects - 1;
@@ -70,7 +70,7 @@ int MachineFrameInfo::CreateSpillStackObject(uint64_t Size,
return Index;
}
-int MachineFrameInfo::CreateVariableSizedObject(unsigned Alignment,
+int MachineFrameInfo::CreateVariableSizedObject(Align Alignment,
const AllocaInst *Alloca) {
HasVarSizedObjects = true;
Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
@@ -88,7 +88,8 @@ int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset,
// object is 16-byte aligned. Note that unlike the non-fixed case, if the
// stack needs realignment, we can't assume that the stack will in fact be
// aligned.
- unsigned Alignment = MinAlign(SPOffset, ForcedRealign ? 1 : StackAlignment);
+ Align Alignment =
+ commonAlignment(ForcedRealign ? Align::None() : StackAlignment, SPOffset);
Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
Objects.insert(Objects.begin(),
StackObject(Size, Alignment, SPOffset, IsImmutable,
@@ -100,7 +101,8 @@ int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset,
int MachineFrameInfo::CreateFixedSpillStackObject(uint64_t Size,
int64_t SPOffset,
bool IsImmutable) {
- unsigned Alignment = MinAlign(SPOffset, ForcedRealign ? 1 : StackAlignment);
+ Align Alignment =
+ commonAlignment(ForcedRealign ? Align::None() : StackAlignment, SPOffset);
Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
Objects.insert(Objects.begin(),
StackObject(Size, Alignment, SPOffset, IsImmutable,
@@ -232,7 +234,7 @@ void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{
OS << "variable sized";
else
OS << "size=" << SO.Size;
- OS << ", align=" << SO.Alignment;
+ OS << ", align=" << SO.Alignment.value();
if (i < NumFixedObjects)
OS << ", fixed";
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index 4df5ce2dcedc..7d2ee230ca9f 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -78,10 +78,11 @@ using namespace llvm;
#define DEBUG_TYPE "codegen"
-static cl::opt<unsigned>
-AlignAllFunctions("align-all-functions",
- cl::desc("Force the alignment of all functions."),
- cl::init(0), cl::Hidden);
+static cl::opt<unsigned> AlignAllFunctions(
+ "align-all-functions",
+ cl::desc("Force the alignment of all functions in log2 format (e.g. 4 "
+ "means align on 16B boundaries)."),
+ cl::init(0), cl::Hidden);
static const char *getPropertyName(MachineFunctionProperties::Property Prop) {
using P = MachineFunctionProperties::Property;
@@ -181,7 +182,7 @@ void MachineFunction::init() {
STI->getTargetLowering()->getPrefFunctionAlignment());
if (AlignAllFunctions)
- Alignment = AlignAllFunctions;
+ Alignment = Align(1ULL << AlignAllFunctions);
JumpTableInfo = nullptr;
@@ -200,7 +201,7 @@ void MachineFunction::init() {
"Target-incompatible DataLayout attached\n");
PSVManager =
- llvm::make_unique<PseudoSourceValueManager>(*(getSubtarget().
+ std::make_unique<PseudoSourceValueManager>(*(getSubtarget().
getInstrInfo()));
}
@@ -823,30 +824,47 @@ try_next:;
return FilterID;
}
-void MachineFunction::addCodeViewHeapAllocSite(MachineInstr *I, MDNode *MD) {
+void MachineFunction::addCodeViewHeapAllocSite(MachineInstr *I,
+ const MDNode *MD) {
MCSymbol *BeginLabel = Ctx.createTempSymbol("heapallocsite", true);
MCSymbol *EndLabel = Ctx.createTempSymbol("heapallocsite", true);
I->setPreInstrSymbol(*this, BeginLabel);
I->setPostInstrSymbol(*this, EndLabel);
- DIType *DI = dyn_cast<DIType>(MD);
+ const DIType *DI = dyn_cast<DIType>(MD);
CodeViewHeapAllocSites.push_back(std::make_tuple(BeginLabel, EndLabel, DI));
}
-void MachineFunction::updateCallSiteInfo(const MachineInstr *Old,
- const MachineInstr *New) {
- if (!Target.Options.EnableDebugEntryValues || Old == New)
- return;
+void MachineFunction::moveCallSiteInfo(const MachineInstr *Old,
+ const MachineInstr *New) {
+ assert(New->isCall() && "Call site info refers only to call instructions!");
- assert(Old->isCall() && (!New || New->isCall()) &&
- "Call site info referes only to call instructions!");
- CallSiteInfoMap::iterator CSIt = CallSitesInfo.find(Old);
+ CallSiteInfoMap::iterator CSIt = getCallSiteInfo(Old);
if (CSIt == CallSitesInfo.end())
return;
+
CallSiteInfo CSInfo = std::move(CSIt->second);
CallSitesInfo.erase(CSIt);
- if (New)
- CallSitesInfo[New] = CSInfo;
+ CallSitesInfo[New] = CSInfo;
+}
+
+void MachineFunction::eraseCallSiteInfo(const MachineInstr *MI) {
+ CallSiteInfoMap::iterator CSIt = getCallSiteInfo(MI);
+ if (CSIt == CallSitesInfo.end())
+ return;
+ CallSitesInfo.erase(CSIt);
+}
+
+void MachineFunction::copyCallSiteInfo(const MachineInstr *Old,
+ const MachineInstr *New) {
+ assert(New->isCall() && "Call site info refers only to call instructions!");
+
+ CallSiteInfoMap::iterator CSIt = getCallSiteInfo(Old);
+ if (CSIt == CallSitesInfo.end())
+ return;
+
+ CallSiteInfo CSInfo = CSIt->second;
+ CallSitesInfo[New] = CSInfo;
}
/// \}
@@ -881,13 +899,13 @@ unsigned MachineJumpTableInfo::getEntryAlignment(const DataLayout &TD) const {
// alignment.
switch (getEntryKind()) {
case MachineJumpTableInfo::EK_BlockAddress:
- return TD.getPointerABIAlignment(0);
+ return TD.getPointerABIAlignment(0).value();
case MachineJumpTableInfo::EK_GPRel64BlockAddress:
- return TD.getABIIntegerTypeAlignment(64);
+ return TD.getABIIntegerTypeAlignment(64).value();
case MachineJumpTableInfo::EK_GPRel32BlockAddress:
case MachineJumpTableInfo::EK_LabelDifference32:
case MachineJumpTableInfo::EK_Custom32:
- return TD.getABIIntegerTypeAlignment(32);
+ return TD.getABIIntegerTypeAlignment(32).value();
case MachineJumpTableInfo::EK_Inline:
return 1;
}
diff --git a/lib/CodeGen/MachineFunctionPass.cpp b/lib/CodeGen/MachineFunctionPass.cpp
index 0da4cf3fc90c..03149aa7db4a 100644
--- a/lib/CodeGen/MachineFunctionPass.cpp
+++ b/lib/CodeGen/MachineFunctionPass.cpp
@@ -41,7 +41,7 @@ bool MachineFunctionPass::runOnFunction(Function &F) {
if (F.hasAvailableExternallyLinkage())
return false;
- MachineModuleInfo &MMI = getAnalysis<MachineModuleInfo>();
+ MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
MachineFunction &MF = MMI.getOrCreateMachineFunction(F);
MachineFunctionProperties &MFProps = MF.getProperties();
@@ -101,8 +101,8 @@ bool MachineFunctionPass::runOnFunction(Function &F) {
}
void MachineFunctionPass::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<MachineModuleInfo>();
- AU.addPreserved<MachineModuleInfo>();
+ AU.addRequired<MachineModuleInfoWrapperPass>();
+ AU.addPreserved<MachineModuleInfoWrapperPass>();
// MachineFunctionPass preserves all LLVM IR passes, but there's no
// high-level way to express this. Instead, just list a bunch of
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index e5c398a2d10c..fec20b2b1a05 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -636,8 +636,8 @@ bool MachineInstr::isIdenticalTo(const MachineInstr &Other,
if (Check == IgnoreDefs)
continue;
else if (Check == IgnoreVRegDefs) {
- if (!TargetRegisterInfo::isVirtualRegister(MO.getReg()) ||
- !TargetRegisterInfo::isVirtualRegister(OMO.getReg()))
+ if (!Register::isVirtualRegister(MO.getReg()) ||
+ !Register::isVirtualRegister(OMO.getReg()))
if (!MO.isIdenticalTo(OMO))
return false;
} else {
@@ -692,8 +692,8 @@ void MachineInstr::eraseFromParentAndMarkDBGValuesForRemoval() {
for (const MachineOperand &MO : MI->operands()) {
if (!MO.isReg() || !MO.isDef())
continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ Register Reg = MO.getReg();
+ if (!Reg.isVirtual())
continue;
MRI.markUsesInDebugValueAsUndef(Reg);
}
@@ -832,6 +832,10 @@ const DIExpression *MachineInstr::getDebugExpression() const {
return cast<DIExpression>(getOperand(3).getMetadata());
}
+bool MachineInstr::isDebugEntryValue() const {
+ return isDebugValue() && getDebugExpression()->isEntryValue();
+}
+
const TargetRegisterClass*
MachineInstr::getRegClassConstraint(unsigned OpIdx,
const TargetInstrInfo *TII,
@@ -873,7 +877,7 @@ MachineInstr::getRegClassConstraint(unsigned OpIdx,
}
const TargetRegisterClass *MachineInstr::getRegClassConstraintEffectForVReg(
- unsigned Reg, const TargetRegisterClass *CurRC, const TargetInstrInfo *TII,
+ Register Reg, const TargetRegisterClass *CurRC, const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI, bool ExploreBundle) const {
// Check every operands inside the bundle if we have
// been asked to.
@@ -890,7 +894,7 @@ const TargetRegisterClass *MachineInstr::getRegClassConstraintEffectForVReg(
}
const TargetRegisterClass *MachineInstr::getRegClassConstraintEffectForVRegImpl(
- unsigned OpIdx, unsigned Reg, const TargetRegisterClass *CurRC,
+ unsigned OpIdx, Register Reg, const TargetRegisterClass *CurRC,
const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) const {
assert(CurRC && "Invalid initial register class");
// Check if Reg is constrained by some of its use/def from MI.
@@ -933,7 +937,7 @@ unsigned MachineInstr::getBundleSize() const {
/// Returns true if the MachineInstr has an implicit-use operand of exactly
/// the given register (not considering sub/super-registers).
-bool MachineInstr::hasRegisterImplicitUseOperand(unsigned Reg) const {
+bool MachineInstr::hasRegisterImplicitUseOperand(Register Reg) const {
for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
const MachineOperand &MO = getOperand(i);
if (MO.isReg() && MO.isUse() && MO.isImplicit() && MO.getReg() == Reg)
@@ -946,12 +950,12 @@ bool MachineInstr::hasRegisterImplicitUseOperand(unsigned Reg) const {
/// the specific register or -1 if it is not found. It further tightens
/// the search criteria to a use that kills the register if isKill is true.
int MachineInstr::findRegisterUseOperandIdx(
- unsigned Reg, bool isKill, const TargetRegisterInfo *TRI) const {
+ Register Reg, bool isKill, const TargetRegisterInfo *TRI) const {
for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
const MachineOperand &MO = getOperand(i);
if (!MO.isReg() || !MO.isUse())
continue;
- unsigned MOReg = MO.getReg();
+ Register MOReg = MO.getReg();
if (!MOReg)
continue;
if (MOReg == Reg || (TRI && Reg && MOReg && TRI->regsOverlap(MOReg, Reg)))
@@ -965,7 +969,7 @@ int MachineInstr::findRegisterUseOperandIdx(
/// indicating if this instruction reads or writes Reg. This also considers
/// partial defines.
std::pair<bool,bool>
-MachineInstr::readsWritesVirtualRegister(unsigned Reg,
+MachineInstr::readsWritesVirtualRegister(Register Reg,
SmallVectorImpl<unsigned> *Ops) const {
bool PartDef = false; // Partial redefine.
bool FullDef = false; // Full define.
@@ -994,9 +998,9 @@ MachineInstr::readsWritesVirtualRegister(unsigned Reg,
/// that are not dead are skipped. If TargetRegisterInfo is non-null, then it
/// also checks if there is a def of a super-register.
int
-MachineInstr::findRegisterDefOperandIdx(unsigned Reg, bool isDead, bool Overlap,
+MachineInstr::findRegisterDefOperandIdx(Register Reg, bool isDead, bool Overlap,
const TargetRegisterInfo *TRI) const {
- bool isPhys = TargetRegisterInfo::isPhysicalRegister(Reg);
+ bool isPhys = Register::isPhysicalRegister(Reg);
for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
const MachineOperand &MO = getOperand(i);
// Accept regmask operands when Overlap is set.
@@ -1005,10 +1009,9 @@ MachineInstr::findRegisterDefOperandIdx(unsigned Reg, bool isDead, bool Overlap,
return i;
if (!MO.isReg() || !MO.isDef())
continue;
- unsigned MOReg = MO.getReg();
+ Register MOReg = MO.getReg();
bool Found = (MOReg == Reg);
- if (!Found && TRI && isPhys &&
- TargetRegisterInfo::isPhysicalRegister(MOReg)) {
+ if (!Found && TRI && isPhys && Register::isPhysicalRegister(MOReg)) {
if (Overlap)
Found = TRI->regsOverlap(MOReg, Reg);
else
@@ -1142,10 +1145,10 @@ void MachineInstr::clearKillInfo() {
}
}
-void MachineInstr::substituteRegister(unsigned FromReg, unsigned ToReg,
+void MachineInstr::substituteRegister(Register FromReg, Register ToReg,
unsigned SubIdx,
const TargetRegisterInfo &RegInfo) {
- if (TargetRegisterInfo::isPhysicalRegister(ToReg)) {
+ if (Register::isPhysicalRegister(ToReg)) {
if (SubIdx)
ToReg = RegInfo.getSubReg(ToReg, SubIdx);
for (MachineOperand &MO : operands()) {
@@ -1165,7 +1168,7 @@ void MachineInstr::substituteRegister(unsigned FromReg, unsigned ToReg,
/// isSafeToMove - Return true if it is safe to move this instruction. If
/// SawStore is set to true, it means that there is a store (or call) between
/// the instruction's location and its intended destination.
-bool MachineInstr::isSafeToMove(AliasAnalysis *AA, bool &SawStore) const {
+bool MachineInstr::isSafeToMove(AAResults *AA, bool &SawStore) const {
// Ignore stuff that we obviously can't move.
//
// Treat volatile loads as stores. This is not strictly necessary for
@@ -1194,7 +1197,7 @@ bool MachineInstr::isSafeToMove(AliasAnalysis *AA, bool &SawStore) const {
return true;
}
-bool MachineInstr::mayAlias(AliasAnalysis *AA, const MachineInstr &Other,
+bool MachineInstr::mayAlias(AAResults *AA, const MachineInstr &Other,
bool UseTBAA) const {
const MachineFunction *MF = getMF();
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
@@ -1206,7 +1209,7 @@ bool MachineInstr::mayAlias(AliasAnalysis *AA, const MachineInstr &Other,
return false;
// Let the target decide if memory accesses cannot possibly overlap.
- if (TII->areMemAccessesTriviallyDisjoint(*this, Other, AA))
+ if (TII->areMemAccessesTriviallyDisjoint(*this, Other))
return false;
// FIXME: Need to handle multiple memory operands to support all targets.
@@ -1312,7 +1315,7 @@ bool MachineInstr::hasOrderedMemoryRef() const {
/// isDereferenceableInvariantLoad - Return true if this instruction will never
/// trap and is loading from a location whose value is invariant across a run of
/// this function.
-bool MachineInstr::isDereferenceableInvariantLoad(AliasAnalysis *AA) const {
+bool MachineInstr::isDereferenceableInvariantLoad(AAResults *AA) const {
// If the instruction doesn't load at all, it isn't an invariant load.
if (!mayLoad())
return false;
@@ -1364,7 +1367,7 @@ unsigned MachineInstr::isConstantValuePHI() const {
assert(getNumOperands() >= 3 &&
"It's illegal to have a PHI without source operands");
- unsigned Reg = getOperand(1).getReg();
+ Register Reg = getOperand(1).getReg();
for (unsigned i = 3, e = getNumOperands(); i < e; i += 2)
if (getOperand(i).getReg() != Reg)
return 0;
@@ -1726,7 +1729,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
MFI = &MF->getFrameInfo();
Context = &MF->getFunction().getContext();
} else {
- CtxPtr = llvm::make_unique<LLVMContext>();
+ CtxPtr = std::make_unique<LLVMContext>();
Context = CtxPtr.get();
}
@@ -1780,10 +1783,10 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
OS << '\n';
}
-bool MachineInstr::addRegisterKilled(unsigned IncomingReg,
+bool MachineInstr::addRegisterKilled(Register IncomingReg,
const TargetRegisterInfo *RegInfo,
bool AddIfNotFound) {
- bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(IncomingReg);
+ bool isPhysReg = Register::isPhysicalRegister(IncomingReg);
bool hasAliases = isPhysReg &&
MCRegAliasIterator(IncomingReg, RegInfo, false).isValid();
bool Found = false;
@@ -1799,7 +1802,7 @@ bool MachineInstr::addRegisterKilled(unsigned IncomingReg,
if (MO.isDebug())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
@@ -1814,8 +1817,7 @@ bool MachineInstr::addRegisterKilled(unsigned IncomingReg,
MO.setIsKill();
Found = true;
}
- } else if (hasAliases && MO.isKill() &&
- TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ } else if (hasAliases && MO.isKill() && Register::isPhysicalRegister(Reg)) {
// A super-register kill already exists.
if (RegInfo->isSuperRegister(IncomingReg, Reg))
return true;
@@ -1847,23 +1849,23 @@ bool MachineInstr::addRegisterKilled(unsigned IncomingReg,
return Found;
}
-void MachineInstr::clearRegisterKills(unsigned Reg,
+void MachineInstr::clearRegisterKills(Register Reg,
const TargetRegisterInfo *RegInfo) {
- if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (!Register::isPhysicalRegister(Reg))
RegInfo = nullptr;
for (MachineOperand &MO : operands()) {
if (!MO.isReg() || !MO.isUse() || !MO.isKill())
continue;
- unsigned OpReg = MO.getReg();
+ Register OpReg = MO.getReg();
if ((RegInfo && RegInfo->regsOverlap(Reg, OpReg)) || Reg == OpReg)
MO.setIsKill(false);
}
}
-bool MachineInstr::addRegisterDead(unsigned Reg,
+bool MachineInstr::addRegisterDead(Register Reg,
const TargetRegisterInfo *RegInfo,
bool AddIfNotFound) {
- bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(Reg);
+ bool isPhysReg = Register::isPhysicalRegister(Reg);
bool hasAliases = isPhysReg &&
MCRegAliasIterator(Reg, RegInfo, false).isValid();
bool Found = false;
@@ -1872,7 +1874,7 @@ bool MachineInstr::addRegisterDead(unsigned Reg,
MachineOperand &MO = getOperand(i);
if (!MO.isReg() || !MO.isDef())
continue;
- unsigned MOReg = MO.getReg();
+ Register MOReg = MO.getReg();
if (!MOReg)
continue;
@@ -1880,7 +1882,7 @@ bool MachineInstr::addRegisterDead(unsigned Reg,
MO.setIsDead();
Found = true;
} else if (hasAliases && MO.isDead() &&
- TargetRegisterInfo::isPhysicalRegister(MOReg)) {
+ Register::isPhysicalRegister(MOReg)) {
// There exists a super-register that's marked dead.
if (RegInfo->isSuperRegister(Reg, MOReg))
return true;
@@ -1913,7 +1915,7 @@ bool MachineInstr::addRegisterDead(unsigned Reg,
return true;
}
-void MachineInstr::clearRegisterDeads(unsigned Reg) {
+void MachineInstr::clearRegisterDeads(Register Reg) {
for (MachineOperand &MO : operands()) {
if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg)
continue;
@@ -1921,7 +1923,7 @@ void MachineInstr::clearRegisterDeads(unsigned Reg) {
}
}
-void MachineInstr::setRegisterDefReadUndef(unsigned Reg, bool IsUndef) {
+void MachineInstr::setRegisterDefReadUndef(Register Reg, bool IsUndef) {
for (MachineOperand &MO : operands()) {
if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg || MO.getSubReg() == 0)
continue;
@@ -1929,9 +1931,9 @@ void MachineInstr::setRegisterDefReadUndef(unsigned Reg, bool IsUndef) {
}
}
-void MachineInstr::addRegisterDefined(unsigned Reg,
+void MachineInstr::addRegisterDefined(Register Reg,
const TargetRegisterInfo *RegInfo) {
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (Register::isPhysicalRegister(Reg)) {
MachineOperand *MO = findRegisterDefOperand(Reg, false, false, RegInfo);
if (MO)
return;
@@ -1947,7 +1949,7 @@ void MachineInstr::addRegisterDefined(unsigned Reg,
true /*IsImp*/));
}
-void MachineInstr::setPhysRegsDeadExcept(ArrayRef<unsigned> UsedRegs,
+void MachineInstr::setPhysRegsDeadExcept(ArrayRef<Register> UsedRegs,
const TargetRegisterInfo &TRI) {
bool HasRegMask = false;
for (MachineOperand &MO : operands()) {
@@ -1956,18 +1958,19 @@ void MachineInstr::setPhysRegsDeadExcept(ArrayRef<unsigned> UsedRegs,
continue;
}
if (!MO.isReg() || !MO.isDef()) continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+ Register Reg = MO.getReg();
+ if (!Reg.isPhysical())
+ continue;
// If there are no uses, including partial uses, the def is dead.
if (llvm::none_of(UsedRegs,
- [&](unsigned Use) { return TRI.regsOverlap(Use, Reg); }))
+ [&](MCRegister Use) { return TRI.regsOverlap(Use, Reg); }))
MO.setIsDead();
}
// This is a call with a register mask operand.
// Mask clobbers are always dead, so add defs for the non-dead defines.
if (HasRegMask)
- for (ArrayRef<unsigned>::iterator I = UsedRegs.begin(), E = UsedRegs.end();
+ for (ArrayRef<Register>::iterator I = UsedRegs.begin(), E = UsedRegs.end();
I != E; ++I)
addRegisterDefined(*I, &TRI);
}
@@ -1979,8 +1982,7 @@ MachineInstrExpressionTrait::getHashValue(const MachineInstr* const &MI) {
HashComponents.reserve(MI->getNumOperands() + 1);
HashComponents.push_back(MI->getOpcode());
for (const MachineOperand &MO : MI->operands()) {
- if (MO.isReg() && MO.isDef() &&
- TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ if (MO.isReg() && MO.isDef() && Register::isVirtualRegister(MO.getReg()))
continue; // Skip virtual register defs.
HashComponents.push_back(hash_value(MO));
@@ -2012,7 +2014,7 @@ void MachineInstr::emitError(StringRef Msg) const {
MachineInstrBuilder llvm::BuildMI(MachineFunction &MF, const DebugLoc &DL,
const MCInstrDesc &MCID, bool IsIndirect,
- unsigned Reg, const MDNode *Variable,
+ Register Reg, const MDNode *Variable,
const MDNode *Expr) {
assert(isa<DILocalVariable>(Variable) && "not a variable");
assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
@@ -2048,7 +2050,7 @@ MachineInstrBuilder llvm::BuildMI(MachineFunction &MF, const DebugLoc &DL,
MachineInstrBuilder llvm::BuildMI(MachineBasicBlock &BB,
MachineBasicBlock::iterator I,
const DebugLoc &DL, const MCInstrDesc &MCID,
- bool IsIndirect, unsigned Reg,
+ bool IsIndirect, Register Reg,
const MDNode *Variable, const MDNode *Expr) {
MachineFunction &MF = *BB.getParent();
MachineInstr *MI = BuildMI(MF, DL, MCID, IsIndirect, Reg, Variable, Expr);
@@ -2118,10 +2120,24 @@ void MachineInstr::collectDebugValues(
}
}
-void MachineInstr::changeDebugValuesDefReg(unsigned Reg) {
+void MachineInstr::changeDebugValuesDefReg(Register Reg) {
// Collect matching debug values.
SmallVector<MachineInstr *, 2> DbgValues;
- collectDebugValues(DbgValues);
+
+ if (!getOperand(0).isReg())
+ return;
+
+ unsigned DefReg = getOperand(0).getReg();
+ auto *MRI = getRegInfo();
+ for (auto &MO : MRI->use_operands(DefReg)) {
+ auto *DI = MO.getParent();
+ if (!DI->isDebugValue())
+ continue;
+ if (DI->getOperand(0).isReg() &&
+ DI->getOperand(0).getReg() == DefReg) {
+ DbgValues.push_back(DI);
+ }
+ }
// Propagate Reg to debug value instructions.
for (auto *DBI : DbgValues)
  DBI->getOperand(0).setReg(Reg);
}
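The bulk of this patch is the mechanical migration from raw unsigned register numbers to the Register wrapper. As a rough sketch of the resulting idiom (the operand MO and the helper are assumptions for illustration, not taken from the patch), Register converts implicitly to and from unsigned, so existing arithmetic and container keys keep working, while the old static TargetRegisterInfo predicates become members:

    #include "llvm/CodeGen/MachineOperand.h"
    #include "llvm/CodeGen/Register.h"

    // Classify a register operand; previously spelled with the
    // TargetRegisterInfo::is{Virtual,Physical}Register statics.
    static const char *classifyReg(const llvm::MachineOperand &MO) {
      llvm::Register Reg = MO.getReg();
      if (Reg.isVirtual())
        return "virtual";
      if (Reg.isPhysical())
        return "physical";
      return "noreg"; // a default-constructed Register compares equal to 0
    }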
diff --git a/lib/CodeGen/MachineInstrBundle.cpp b/lib/CodeGen/MachineInstrBundle.cpp
index 32e266e9401e..feb849ced353 100644
--- a/lib/CodeGen/MachineInstrBundle.cpp
+++ b/lib/CodeGen/MachineInstrBundle.cpp
@@ -154,10 +154,10 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
continue;
}
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
- assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+
if (LocalDefSet.count(Reg)) {
MO.setIsInternalRead();
if (MO.isKill())
@@ -177,7 +177,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
MachineOperand &MO = *Defs[i];
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
@@ -194,7 +194,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
DeadDefSet.erase(Reg);
}
- if (!MO.isDead()) {
+ if (!MO.isDead() && Register::isPhysicalRegister(Reg)) {
for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
unsigned SubReg = *SubRegs;
if (LocalDefSet.insert(SubReg).second)
@@ -316,7 +316,7 @@ MachineOperandIteratorBase::analyzePhysReg(unsigned Reg,
bool AllDefsDead = true;
PhysRegInfo PRI = {false, false, false, false, false, false, false, false};
- assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ assert(Register::isPhysicalRegister(Reg) &&
"analyzePhysReg not given a physical register!");
for (; isValid(); ++*this) {
MachineOperand &MO = deref();
@@ -329,8 +329,8 @@ MachineOperandIteratorBase::analyzePhysReg(unsigned Reg,
if (!MO.isReg())
continue;
- unsigned MOReg = MO.getReg();
- if (!MOReg || !TargetRegisterInfo::isPhysicalRegister(MOReg))
+ Register MOReg = MO.getReg();
+ if (!MOReg || !Register::isPhysicalRegister(MOReg))
continue;
if (!TRI->regsOverlap(MOReg, Reg))
diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp
index 1107e609c258..6a898ff6ef88 100644
--- a/lib/CodeGen/MachineLICM.cpp
+++ b/lib/CodeGen/MachineLICM.cpp
@@ -153,7 +153,6 @@ namespace {
AU.addRequired<MachineDominatorTree>();
AU.addRequired<AAResultsWrapperPass>();
AU.addPreserved<MachineLoopInfo>();
- AU.addPreserved<MachineDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -424,10 +423,10 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI,
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
- assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ assert(Register::isPhysicalRegister(Reg) &&
"Not expecting virtual register!");
if (!MO.isDef()) {
@@ -526,7 +525,7 @@ void MachineLICMBase::HoistRegionPostRA() {
for (const MachineOperand &MO : TI->operands()) {
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
@@ -554,7 +553,7 @@ void MachineLICMBase::HoistRegionPostRA() {
for (const MachineOperand &MO : MI->operands()) {
if (!MO.isReg() || MO.isDef() || !MO.getReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (PhysRegDefs.test(Reg) ||
PhysRegClobbers.test(Reg)) {
// If it's using a non-loop-invariant register, then it's obviously
@@ -852,8 +851,8 @@ MachineLICMBase::calcRegisterCost(const MachineInstr *MI, bool ConsiderSeen,
const MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg() || MO.isImplicit())
continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ Register Reg = MO.getReg();
+ if (!Register::isVirtualRegister(Reg))
continue;
// FIXME: It seems bad to use RegSeen only for some of these calculations.
@@ -922,12 +921,12 @@ static bool isInvariantStore(const MachineInstr &MI,
// Check that all register operands are caller-preserved physical registers.
for (const MachineOperand &MO : MI.operands()) {
if (MO.isReg()) {
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
// If operand is a virtual register, check if it comes from a copy of a
// physical register.
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
Reg = TRI->lookThruCopyLike(MO.getReg(), MRI);
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
return false;
if (!TRI->isCallerPreservedPhysReg(Reg, *MI.getMF()))
return false;
@@ -955,17 +954,17 @@ static bool isCopyFeedingInvariantStore(const MachineInstr &MI,
const MachineFunction *MF = MI.getMF();
// Check that we are copying a constant physical register.
- unsigned CopySrcReg = MI.getOperand(1).getReg();
- if (TargetRegisterInfo::isVirtualRegister(CopySrcReg))
+ Register CopySrcReg = MI.getOperand(1).getReg();
+ if (Register::isVirtualRegister(CopySrcReg))
return false;
if (!TRI->isCallerPreservedPhysReg(CopySrcReg, *MF))
return false;
- unsigned CopyDstReg = MI.getOperand(0).getReg();
+ Register CopyDstReg = MI.getOperand(0).getReg();
// Check if any of the uses of the copy are invariant stores.
- assert (TargetRegisterInfo::isVirtualRegister(CopyDstReg) &&
- "copy dst is not a virtual reg");
+ assert(Register::isVirtualRegister(CopyDstReg) &&
+ "copy dst is not a virtual reg");
for (MachineInstr &UseMI : MRI->use_instructions(CopyDstReg)) {
if (UseMI.mayStore() && isInvariantStore(UseMI, TRI, MRI))
@@ -1010,11 +1009,11 @@ bool MachineLICMBase::IsLoopInvariantInst(MachineInstr &I) {
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == 0) continue;
// Don't hoist an instruction that uses or defines a physical register.
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (Register::isPhysicalRegister(Reg)) {
if (MO.isUse()) {
// If the physreg has no defs anywhere, it's just an ambient register
// and we can freely move its uses. Alternatively, if it's allocatable,
@@ -1061,8 +1060,8 @@ bool MachineLICMBase::HasLoopPHIUse(const MachineInstr *MI) const {
for (const MachineOperand &MO : MI->operands()) {
if (!MO.isReg() || !MO.isDef())
continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ Register Reg = MO.getReg();
+ if (!Register::isVirtualRegister(Reg))
continue;
for (MachineInstr &UseMI : MRI->use_instructions(Reg)) {
// A PHI may cause a copy to be inserted.
@@ -1104,7 +1103,7 @@ bool MachineLICMBase::HasHighOperandLatency(MachineInstr &MI,
const MachineOperand &MO = UseMI.getOperand(i);
if (!MO.isReg() || !MO.isUse())
continue;
- unsigned MOReg = MO.getReg();
+ Register MOReg = MO.getReg();
if (MOReg != Reg)
continue;
@@ -1132,8 +1131,8 @@ bool MachineLICMBase::IsCheapInstruction(MachineInstr &MI) const {
if (!DefMO.isReg() || !DefMO.isDef())
continue;
--NumDefs;
- unsigned Reg = DefMO.getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ Register Reg = DefMO.getReg();
+ if (Register::isPhysicalRegister(Reg))
continue;
if (!TII->hasLowDefLatency(SchedModel, MI, i))
@@ -1225,8 +1224,8 @@ bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI) {
const MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || MO.isImplicit())
continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ Register Reg = MO.getReg();
+ if (!Register::isVirtualRegister(Reg))
continue;
if (MO.isDef() && HasHighOperandLatency(MI, i, Reg)) {
LLVM_DEBUG(dbgs() << "Hoist High Latency: " << MI);
@@ -1304,7 +1303,7 @@ MachineInstr *MachineLICMBase::ExtractHoistableLoad(MachineInstr *MI) {
MachineFunction &MF = *MI->getMF();
const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex, TRI, MF);
// Ok, we're unfolding. Create a temporary register and do the unfold.
- unsigned Reg = MRI->createVirtualRegister(RC);
+ Register Reg = MRI->createVirtualRegister(RC);
SmallVector<MachineInstr *, 2> NewMIs;
bool Success = TII->unfoldMemoryOperand(MF, *MI, Reg,
@@ -1378,20 +1377,20 @@ bool MachineLICMBase::EliminateCSE(MachineInstr *MI,
// Physical registers may not differ here.
assert((!MO.isReg() || MO.getReg() == 0 ||
- !TargetRegisterInfo::isPhysicalRegister(MO.getReg()) ||
+ !Register::isPhysicalRegister(MO.getReg()) ||
MO.getReg() == Dup->getOperand(i).getReg()) &&
"Instructions with different phys regs are not identical!");
if (MO.isReg() && MO.isDef() &&
- !TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ !Register::isPhysicalRegister(MO.getReg()))
Defs.push_back(i);
}
SmallVector<const TargetRegisterClass*, 2> OrigRCs;
for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
unsigned Idx = Defs[i];
- unsigned Reg = MI->getOperand(Idx).getReg();
- unsigned DupReg = Dup->getOperand(Idx).getReg();
+ Register Reg = MI->getOperand(Idx).getReg();
+ Register DupReg = Dup->getOperand(Idx).getReg();
OrigRCs.push_back(MRI->getRegClass(DupReg));
if (!MRI->constrainRegClass(DupReg, MRI->getRegClass(Reg))) {
@@ -1403,8 +1402,8 @@ bool MachineLICMBase::EliminateCSE(MachineInstr *MI,
}
for (unsigned Idx : Defs) {
- unsigned Reg = MI->getOperand(Idx).getReg();
- unsigned DupReg = Dup->getOperand(Idx).getReg();
+ Register Reg = MI->getOperand(Idx).getReg();
+ Register DupReg = Dup->getOperand(Idx).getReg();
MRI->replaceRegWith(Reg, DupReg);
MRI->clearKillFlags(DupReg);
}
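For reference, the isInvariantStore/isCopyFeedingInvariantStore pair above hinges on the look-through step: a virtual register only qualifies if it ultimately copies a caller-preserved physical register. A minimal sketch of that check in isolation, with MI, TRI and MRI assumed to come from the surrounding pass:

    // Follow copy-like instructions back to the source register; only a
    // caller-preserved physical register makes the store invariant.
    llvm::Register Reg = MI.getOperand(1).getReg();
    if (Reg.isVirtual())
      Reg = TRI->lookThruCopyLike(Reg, MRI);
    bool Invariant =
        !Reg.isVirtual() && TRI->isCallerPreservedPhysReg(Reg, *MI.getMF());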
diff --git a/lib/CodeGen/MachineLoopUtils.cpp b/lib/CodeGen/MachineLoopUtils.cpp
new file mode 100644
index 000000000000..e074b76082f0
--- /dev/null
+++ b/lib/CodeGen/MachineLoopUtils.cpp
@@ -0,0 +1,132 @@
+//=- MachineLoopUtils.cpp - Functions for manipulating loops ----------------=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineLoopUtils.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+using namespace llvm;
+
+namespace {
+// MI's parent and BB are clones of each other. Find the equivalent copy of MI
+// in BB.
+MachineInstr &findEquivalentInstruction(MachineInstr &MI,
+ MachineBasicBlock *BB) {
+ MachineBasicBlock *PB = MI.getParent();
+ unsigned Offset = std::distance(
+ PB->instr_begin(), MachineBasicBlock::instr_iterator(MI));
+ return *std::next(BB->instr_begin(), Offset);
+}
+} // namespace
+
+MachineBasicBlock *llvm::PeelSingleBlockLoop(LoopPeelDirection Direction,
+ MachineBasicBlock *Loop,
+ MachineRegisterInfo &MRI,
+ const TargetInstrInfo *TII) {
+ MachineFunction &MF = *Loop->getParent();
+ MachineBasicBlock *Preheader = *Loop->pred_begin();
+ if (Preheader == Loop)
+ Preheader = *std::next(Loop->pred_begin());
+ MachineBasicBlock *Exit = *Loop->succ_begin();
+ if (Exit == Loop)
+ Exit = *std::next(Loop->succ_begin());
+
+ MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(Loop->getBasicBlock());
+ if (Direction == LPD_Front)
+ MF.insert(Loop->getIterator(), NewBB);
+ else
+ MF.insert(std::next(Loop->getIterator()), NewBB);
+
+ // FIXME: Add DenseMapInfo trait for Register so we can use it as a key.
+ DenseMap<unsigned, Register> Remaps;
+ auto InsertPt = NewBB->end();
+ for (MachineInstr &MI : *Loop) {
+ MachineInstr *NewMI = MF.CloneMachineInstr(&MI);
+ NewBB->insert(InsertPt, NewMI);
+ for (MachineOperand &MO : NewMI->defs()) {
+ Register OrigR = MO.getReg();
+ if (OrigR.isPhysical())
+ continue;
+ Register &R = Remaps[OrigR];
+ R = MRI.createVirtualRegister(MRI.getRegClass(OrigR));
+ MO.setReg(R);
+
+ if (Direction == LPD_Back) {
+ // Replace all uses outside the original loop with the new register.
+ // FIXME: is the use_iterator stable enough to mutate register uses
+ // while iterating?
+ SmallVector<MachineOperand *, 4> Uses;
+ for (auto &Use : MRI.use_operands(OrigR))
+ if (Use.getParent()->getParent() != Loop)
+ Uses.push_back(&Use);
+ for (auto *Use : Uses) {
+ MRI.constrainRegClass(R, MRI.getRegClass(Use->getReg()));
+ Use->setReg(R);
+ }
+ }
+ }
+ }
+
+ for (auto I = NewBB->getFirstNonPHI(); I != NewBB->end(); ++I)
+ for (MachineOperand &MO : I->uses())
+ if (MO.isReg() && Remaps.count(MO.getReg()))
+ MO.setReg(Remaps[MO.getReg()]);
+
+ for (auto I = NewBB->begin(); I->isPHI(); ++I) {
+ MachineInstr &MI = *I;
+ unsigned LoopRegIdx = 3, InitRegIdx = 1;
+ if (MI.getOperand(2).getMBB() != Preheader)
+ std::swap(LoopRegIdx, InitRegIdx);
+ MachineInstr &OrigPhi = findEquivalentInstruction(MI, Loop);
+ assert(OrigPhi.isPHI());
+ if (Direction == LPD_Front) {
+ // When peeling front, we are only left with the initial value from the
+ // preheader.
+ Register R = MI.getOperand(LoopRegIdx).getReg();
+ if (Remaps.count(R))
+ R = Remaps[R];
+ OrigPhi.getOperand(InitRegIdx).setReg(R);
+ MI.RemoveOperand(LoopRegIdx + 1);
+ MI.RemoveOperand(LoopRegIdx + 0);
+ } else {
+ // When peeling back, the initial value is the loop-carried value from
+ // the original loop.
+ Register LoopReg = OrigPhi.getOperand(LoopRegIdx).getReg();
+ MI.getOperand(LoopRegIdx).setReg(LoopReg);
+ MI.RemoveOperand(InitRegIdx + 1);
+ MI.RemoveOperand(InitRegIdx + 0);
+ }
+ }
+
+ DebugLoc DL;
+ if (Direction == LPD_Front) {
+ Preheader->replaceSuccessor(Loop, NewBB);
+ NewBB->addSuccessor(Loop);
+ Loop->replacePhiUsesWith(Preheader, NewBB);
+ if (TII->removeBranch(*Preheader) > 0)
+ TII->insertBranch(*Preheader, NewBB, nullptr, {}, DL);
+ TII->removeBranch(*NewBB);
+ TII->insertBranch(*NewBB, Loop, nullptr, {}, DL);
+ } else {
+ Loop->replaceSuccessor(Exit, NewBB);
+ Exit->replacePhiUsesWith(Loop, NewBB);
+ NewBB->addSuccessor(Exit);
+
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ bool CanAnalyzeBr = !TII->analyzeBranch(*Loop, TBB, FBB, Cond);
+ (void)CanAnalyzeBr;
+ assert(CanAnalyzeBr && "Must be able to analyze the loop branch!");
+ TII->removeBranch(*Loop);
+ TII->insertBranch(*Loop, TBB == Exit ? NewBB : TBB,
+ FBB == Exit ? NewBB : FBB, Cond, DL);
+ if (TII->removeBranch(*NewBB) > 0)
+ TII->insertBranch(*NewBB, Exit, nullptr, {}, DL);
+ }
+
+ return NewBB;
+}
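This utility is consumed by the PeelingModuloScheduleExpander wired into the pipeliner later in this diff; a minimal sketch of a direct call site follows (the helper and its caller are hypothetical, and the loop must already have the single-preheader, single-exit shape the utility assumes):

    #include "llvm/CodeGen/MachineFunction.h"
    #include "llvm/CodeGen/MachineLoopUtils.h"
    #include "llvm/CodeGen/TargetSubtargetInfo.h"

    // Peel one iteration off the front of a single-block loop.
    static llvm::MachineBasicBlock *peelFront(llvm::MachineBasicBlock *LoopBB) {
      llvm::MachineFunction &MF = *LoopBB->getParent();
      const llvm::TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
      return llvm::PeelSingleBlockLoop(llvm::LPD_Front, LoopBB,
                                       MF.getRegInfo(), TII);
    }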
diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp
index aadcd7319799..e0b4e9cac229 100644
--- a/lib/CodeGen/MachineModuleInfo.cpp
+++ b/lib/CodeGen/MachineModuleInfo.cpp
@@ -36,11 +36,6 @@
using namespace llvm;
using namespace llvm::dwarf;
-// Handle the Pass registration stuff necessary to use DataLayout's.
-INITIALIZE_PASS(MachineModuleInfo, "machinemoduleinfo",
- "Machine Module Information", false, false)
-char MachineModuleInfo::ID = 0;
-
// Out of line virtual method.
MachineModuleInfoImpl::~MachineModuleInfoImpl() = default;
@@ -121,7 +116,7 @@ ArrayRef<MCSymbol *> MMIAddrLabelMap::getAddrLabelSymbolToEmit(BasicBlock *BB) {
BBCallbacks.back().setMap(this);
Entry.Index = BBCallbacks.size() - 1;
Entry.Fn = BB->getParent();
- Entry.Symbols.push_back(Context.createTempSymbol());
+ Entry.Symbols.push_back(Context.createTempSymbol(!BB->hasAddressTaken()));
return Entry.Symbols;
}
@@ -193,27 +188,15 @@ void MMIAddrLabelMapCallbackPtr::allUsesReplacedWith(Value *V2) {
Map->UpdateForRAUWBlock(cast<BasicBlock>(getValPtr()), cast<BasicBlock>(V2));
}
-MachineModuleInfo::MachineModuleInfo(const LLVMTargetMachine *TM)
- : ImmutablePass(ID), TM(*TM),
- Context(TM->getMCAsmInfo(), TM->getMCRegisterInfo(),
- TM->getObjFileLowering(), nullptr, false) {
- initializeMachineModuleInfoPass(*PassRegistry::getPassRegistry());
-}
-
-MachineModuleInfo::~MachineModuleInfo() = default;
-
-bool MachineModuleInfo::doInitialization(Module &M) {
+void MachineModuleInfo::initialize() {
ObjFileMMI = nullptr;
CurCallSite = 0;
UsesMSVCFloatingPoint = UsesMorestackAddr = false;
HasSplitStack = HasNosplitStack = false;
AddrLabelSymbols = nullptr;
- TheModule = &M;
- DbgInfoAvailable = !llvm::empty(M.debug_compile_units());
- return false;
}
-bool MachineModuleInfo::doFinalization(Module &M) {
+void MachineModuleInfo::finalize() {
Personalities.clear();
delete AddrLabelSymbols;
@@ -223,10 +206,30 @@ bool MachineModuleInfo::doFinalization(Module &M) {
delete ObjFileMMI;
ObjFileMMI = nullptr;
+}
- return false;
+MachineModuleInfo::MachineModuleInfo(MachineModuleInfo &&MMI)
+ : TM(std::move(MMI.TM)),
+ Context(MMI.TM.getMCAsmInfo(), MMI.TM.getMCRegisterInfo(),
+ MMI.TM.getObjFileLowering(), nullptr, nullptr, false) {
+ ObjFileMMI = MMI.ObjFileMMI;
+ CurCallSite = MMI.CurCallSite;
+ UsesMSVCFloatingPoint = MMI.UsesMSVCFloatingPoint;
+ UsesMorestackAddr = MMI.UsesMorestackAddr;
+ HasSplitStack = MMI.HasSplitStack;
+ HasNosplitStack = MMI.HasNosplitStack;
+ AddrLabelSymbols = MMI.AddrLabelSymbols;
+ TheModule = MMI.TheModule;
}
+MachineModuleInfo::MachineModuleInfo(const LLVMTargetMachine *TM)
+ : TM(*TM), Context(TM->getMCAsmInfo(), TM->getMCRegisterInfo(),
+ TM->getObjFileLowering(), nullptr, nullptr, false) {
+ initialize();
+}
+
+MachineModuleInfo::~MachineModuleInfo() { finalize(); }
+
//===- Address of Block Management ----------------------------------------===//
ArrayRef<MCSymbol *>
@@ -305,12 +308,13 @@ public:
FreeMachineFunction() : FunctionPass(ID) {}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineModuleInfo>();
- AU.addPreserved<MachineModuleInfo>();
+ AU.addRequired<MachineModuleInfoWrapperPass>();
+ AU.addPreserved<MachineModuleInfoWrapperPass>();
}
bool runOnFunction(Function &F) override {
- MachineModuleInfo &MMI = getAnalysis<MachineModuleInfo>();
+ MachineModuleInfo &MMI =
+ getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
MMI.deleteMachineFunctionFor(F);
return true;
}
@@ -327,3 +331,36 @@ char FreeMachineFunction::ID;
FunctionPass *llvm::createFreeMachineFunctionPass() {
return new FreeMachineFunction();
}
+
+MachineModuleInfoWrapperPass::MachineModuleInfoWrapperPass(
+ const LLVMTargetMachine *TM)
+ : ImmutablePass(ID), MMI(TM) {
+ initializeMachineModuleInfoWrapperPassPass(*PassRegistry::getPassRegistry());
+}
+
+// Handle the pass registration boilerplate for the wrapper pass.
+INITIALIZE_PASS(MachineModuleInfoWrapperPass, "machinemoduleinfo",
+ "Machine Module Information", false, false)
+char MachineModuleInfoWrapperPass::ID = 0;
+
+bool MachineModuleInfoWrapperPass::doInitialization(Module &M) {
+ MMI.initialize();
+ MMI.TheModule = &M;
+ MMI.DbgInfoAvailable = !M.debug_compile_units().empty();
+ return false;
+}
+
+bool MachineModuleInfoWrapperPass::doFinalization(Module &M) {
+ MMI.finalize();
+ return false;
+}
+
+AnalysisKey MachineModuleAnalysis::Key;
+
+MachineModuleInfo MachineModuleAnalysis::run(Module &M,
+ ModuleAnalysisManager &) {
+ MachineModuleInfo MMI(TM);
+ MMI.TheModule = &M;
+ MMI.DbgInfoAvailable = !M.debug_compile_units().empty();
+ return MMI;
+}
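The client-side effect of this split shows up in the MachineOutliner hunks later in this diff: MachineModuleInfo is no longer a pass itself, so legacy-PM code requests the wrapper and unwraps it, while the new PM obtains it from MachineModuleAnalysis. A minimal legacy-PM sketch, with MyModulePass hypothetical:

    // getAnalysisUsage: depend on the wrapper, not on MachineModuleInfo.
    void MyModulePass::getAnalysisUsage(llvm::AnalysisUsage &AU) const {
      AU.addRequired<llvm::MachineModuleInfoWrapperPass>();
      AU.addPreserved<llvm::MachineModuleInfoWrapperPass>();
    }

    bool MyModulePass::runOnModule(llvm::Module &M) {
      llvm::MachineModuleInfo &MMI =
          getAnalysis<llvm::MachineModuleInfoWrapperPass>().getMMI();
      // ... query MMI, e.g. MMI.getMachineFunction(F) for some F in M.
      return false;
    }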
diff --git a/lib/CodeGen/MachineOperand.cpp b/lib/CodeGen/MachineOperand.cpp
index 4fa4ea7f6cf5..8b19501ec3cf 100644
--- a/lib/CodeGen/MachineOperand.cpp
+++ b/lib/CodeGen/MachineOperand.cpp
@@ -49,7 +49,7 @@ static MachineFunction *getMFIfAvailable(MachineOperand &MO) {
getMFIfAvailable(const_cast<const MachineOperand &>(MO)));
}
-void MachineOperand::setReg(unsigned Reg) {
+void MachineOperand::setReg(Register Reg) {
if (getReg() == Reg)
return; // No change.
@@ -71,9 +71,9 @@ void MachineOperand::setReg(unsigned Reg) {
SmallContents.RegNo = Reg;
}
-void MachineOperand::substVirtReg(unsigned Reg, unsigned SubIdx,
+void MachineOperand::substVirtReg(Register Reg, unsigned SubIdx,
const TargetRegisterInfo &TRI) {
- assert(TargetRegisterInfo::isVirtualRegister(Reg));
+ assert(Reg.isVirtual());
if (SubIdx && getSubReg())
SubIdx = TRI.composeSubRegIndices(SubIdx, getSubReg());
setReg(Reg);
@@ -81,8 +81,8 @@ void MachineOperand::substVirtReg(unsigned Reg, unsigned SubIdx,
setSubReg(SubIdx);
}
-void MachineOperand::substPhysReg(unsigned Reg, const TargetRegisterInfo &TRI) {
- assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+void MachineOperand::substPhysReg(MCRegister Reg,
+ const TargetRegisterInfo &TRI) {
+ assert(Reg.isPhysical());
if (getSubReg()) {
Reg = TRI.getSubReg(Reg, getSubReg());
// Note that getSubReg() may return 0 if the sub-register doesn't exist.
@@ -114,7 +114,7 @@ void MachineOperand::setIsDef(bool Val) {
bool MachineOperand::isRenamable() const {
assert(isReg() && "Wrong MachineOperand accessor");
- assert(TargetRegisterInfo::isPhysicalRegister(getReg()) &&
+ assert(Register::isPhysicalRegister(getReg()) &&
"isRenamable should only be checked on physical registers");
if (!IsRenamable)
return false;
@@ -132,7 +132,7 @@ bool MachineOperand::isRenamable() const {
void MachineOperand::setIsRenamable(bool Val) {
assert(isReg() && "Wrong MachineOperand accessor");
- assert(TargetRegisterInfo::isPhysicalRegister(getReg()) &&
+ assert(Register::isPhysicalRegister(getReg()) &&
"setIsRenamable should only be called on physical registers");
IsRenamable = Val;
}
@@ -169,7 +169,7 @@ void MachineOperand::ChangeToFPImmediate(const ConstantFP *FPImm) {
}
void MachineOperand::ChangeToES(const char *SymName,
- unsigned char TargetFlags) {
+ unsigned TargetFlags) {
assert((!isReg() || !isTied()) &&
"Cannot change a tied operand into an external symbol");
@@ -182,7 +182,7 @@ void MachineOperand::ChangeToES(const char *SymName,
}
void MachineOperand::ChangeToGA(const GlobalValue *GV, int64_t Offset,
- unsigned char TargetFlags) {
+ unsigned TargetFlags) {
assert((!isReg() || !isTied()) &&
"Cannot change a tied operand into a global address");
@@ -215,7 +215,7 @@ void MachineOperand::ChangeToFrameIndex(int Idx) {
}
void MachineOperand::ChangeToTargetIndex(unsigned Idx, int64_t Offset,
- unsigned char TargetFlags) {
+ unsigned TargetFlags) {
assert((!isReg() || !isTied()) &&
"Cannot change a tied operand into a FrameIndex");
@@ -230,7 +230,7 @@ void MachineOperand::ChangeToTargetIndex(unsigned Idx, int64_t Offset,
/// ChangeToRegister - Replace this operand with a new register operand of
/// the specified value. If an operand is known to be a register already,
/// the setReg method should be used.
-void MachineOperand::ChangeToRegister(unsigned Reg, bool isDef, bool isImp,
+void MachineOperand::ChangeToRegister(Register Reg, bool isDef, bool isImp,
bool isKill, bool isDead, bool isUndef,
bool isDebug) {
MachineRegisterInfo *RegInfo = nullptr;
@@ -333,6 +333,8 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {
return getIntrinsicID() == Other.getIntrinsicID();
case MachineOperand::MO_Predicate:
return getPredicate() == Other.getPredicate();
+ case MachineOperand::MO_ShuffleMask:
+ return getShuffleMask() == Other.getShuffleMask();
}
llvm_unreachable("Invalid machine operand type");
}
@@ -381,6 +383,8 @@ hash_code llvm::hash_value(const MachineOperand &MO) {
return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIntrinsicID());
case MachineOperand::MO_Predicate:
return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getPredicate());
+ case MachineOperand::MO_ShuffleMask:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getShuffleMask());
}
llvm_unreachable("Invalid machine operand type");
}
@@ -425,12 +429,10 @@ static void printCFIRegister(unsigned DwarfReg, raw_ostream &OS,
return;
}
- int Reg = TRI->getLLVMRegNum(DwarfReg, true);
- if (Reg == -1) {
+ if (Optional<unsigned> Reg = TRI->getLLVMRegNum(DwarfReg, true))
+ OS << printReg(*Reg, TRI);
+ else
OS << "<badreg>";
- return;
- }
- OS << printReg(Reg, TRI);
}
static void printIRBlockReference(raw_ostream &OS, const BasicBlock &BB,
@@ -746,7 +748,7 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
printTargetFlags(OS, *this);
switch (getType()) {
case MachineOperand::MO_Register: {
- unsigned Reg = getReg();
+ Register Reg = getReg();
if (isImplicit())
OS << (isDef() ? "implicit-def " : "implicit ");
else if (PrintDef && isDef())
@@ -762,13 +764,13 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
OS << "undef ";
if (isEarlyClobber())
OS << "early-clobber ";
- if (TargetRegisterInfo::isPhysicalRegister(getReg()) && isRenamable())
+ if (Register::isPhysicalRegister(getReg()) && isRenamable())
OS << "renamable ";
// isDebug() is exactly true for register operands of a DBG_VALUE. So we
// simply infer it when parsing and do not need to print it.
const MachineRegisterInfo *MRI = nullptr;
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
if (const MachineFunction *MF = getMFIfAvailable(*this)) {
MRI = &MF->getRegInfo();
}
@@ -783,7 +785,7 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
OS << ".subreg" << SubReg;
}
// Print the register class / bank.
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
if (const MachineFunction *MF = getMFIfAvailable(*this)) {
const MachineRegisterInfo &MRI = MF->getRegInfo();
if (IsStandalone || !PrintDef || MRI.def_empty(Reg)) {
@@ -936,6 +938,20 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
<< CmpInst::getPredicateName(Pred) << ')';
break;
}
+ case MachineOperand::MO_ShuffleMask:
+ OS << "shufflemask(";
+ const Constant *C = getShuffleMask();
+ const int NumElts = C->getType()->getVectorNumElements();
+
+ StringRef Separator;
+ for (int I = 0; I != NumElts; ++I) {
+ OS << Separator;
+ C->getAggregateElement(I)->printAsOperand(OS, false, MST);
+ Separator = ", ";
+ }
+
+ OS << ')';
+ break;
}
}
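Since printAsOperand is called with PrintType set to false, each mask element prints as a bare value, with undef elements rendered literally; an illustrative (not test-derived) four-element operand would print as:

    shufflemask(0, 2, undef, 3)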
@@ -963,7 +979,8 @@ bool MachinePointerInfo::isDereferenceable(unsigned Size, LLVMContext &C,
return false;
return isDereferenceableAndAlignedPointer(
- BasePtr, 1, APInt(DL.getPointerSizeInBits(), Offset + Size), DL);
+ BasePtr, Align::None(), APInt(DL.getPointerSizeInBits(), Offset + Size),
+ DL);
}
/// getConstantPool - Return a MachinePointerInfo record that refers to the
@@ -1049,17 +1066,6 @@ uint64_t MachineMemOperand::getAlignment() const {
return MinAlign(getBaseAlignment(), getOffset());
}
-void MachineMemOperand::print(raw_ostream &OS) const {
- ModuleSlotTracker DummyMST(nullptr);
- print(OS, DummyMST);
-}
-
-void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST) const {
- SmallVector<StringRef, 0> SSNs;
- LLVMContext Ctx;
- print(OS, MST, SSNs, Ctx, nullptr, nullptr);
-}
-
void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
SmallVectorImpl<StringRef> &SSNs,
const LLVMContext &Context,
diff --git a/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp b/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp
index 27db9106b337..b82403ae1b85 100644
--- a/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp
+++ b/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp
@@ -76,7 +76,7 @@ bool MachineOptimizationRemarkEmitterPass::runOnMachineFunction(
else
MBFI = nullptr;
- ORE = llvm::make_unique<MachineOptimizationRemarkEmitter>(MF, MBFI);
+ ORE = std::make_unique<MachineOptimizationRemarkEmitter>(MF, MBFI);
return false;
}
diff --git a/lib/CodeGen/MachineOutliner.cpp b/lib/CodeGen/MachineOutliner.cpp
index 80a235aeaa5c..8cd66825a58a 100644
--- a/lib/CodeGen/MachineOutliner.cpp
+++ b/lib/CodeGen/MachineOutliner.cpp
@@ -846,8 +846,8 @@ struct MachineOutliner : public ModulePass {
StringRef getPassName() const override { return "Machine Outliner"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<MachineModuleInfo>();
- AU.addPreserved<MachineModuleInfo>();
+ AU.addRequired<MachineModuleInfoWrapperPass>();
+ AU.addPreserved<MachineModuleInfoWrapperPass>();
AU.setPreservesAll();
ModulePass::getAnalysisUsage(AU);
}
@@ -1128,7 +1128,7 @@ MachineOutliner::createOutlinedFunction(Module &M, OutlinedFunction &OF,
IRBuilder<> Builder(EntryBB);
Builder.CreateRetVoid();
- MachineModuleInfo &MMI = getAnalysis<MachineModuleInfo>();
+ MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
MachineFunction &MF = MMI.getOrCreateMachineFunction(*F);
MachineBasicBlock &MBB = *MF.CreateMachineBasicBlock();
const TargetSubtargetInfo &STI = MF.getSubtarget();
@@ -1260,7 +1260,7 @@ bool MachineOutliner::outline(Module &M,
true /* isImp = true */));
}
if (MI.isCall())
- MI.getMF()->updateCallSiteInfo(&MI);
+ MI.getMF()->eraseCallSiteInfo(&MI);
};
// Copy over the defs in the outlined range.
// First inst in outlined range <-- Anything that's defined in this
@@ -1303,6 +1303,12 @@ void MachineOutliner::populateMapper(InstructionMapper &Mapper, Module &M,
if (F.empty())
continue;
+ // Disable outlining from noreturn functions right now. Noreturn requires
+ // special handling for the case where what we are outlining could be a
+ // tail call.
+ if (F.hasFnAttribute(Attribute::NoReturn))
+ continue;
+
// There's something in F. Check if it has a MachineFunction associated with
// it.
MachineFunction *MF = MMI.getMachineFunction(F);
@@ -1421,7 +1427,7 @@ bool MachineOutliner::runOnModule(Module &M) {
if (M.empty())
return false;
- MachineModuleInfo &MMI = getAnalysis<MachineModuleInfo>();
+ MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
// If the user passed -enable-machine-outliner=always or
// -enable-machine-outliner, the pass will run on all functions in the module.
diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp
index 54df522d371a..89c9f6093a97 100644
--- a/lib/CodeGen/MachinePipeliner.cpp
+++ b/lib/CodeGen/MachinePipeliner.cpp
@@ -56,6 +56,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachinePipeliner.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ModuloSchedule.h"
#include "llvm/CodeGen/RegisterPressure.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/ScheduleDAGMutation.h"
@@ -153,6 +154,17 @@ static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
cl::init(false));
+static cl::opt<bool> EmitTestAnnotations(
+ "pipeliner-annotate-for-testing", cl::Hidden, cl::init(false),
+ cl::desc("Instead of emitting the pipelined code, annotate instructions "
+ "with the generated schedule for feeding into the "
+ "-modulo-schedule-test pass"));
+
+static cl::opt<bool> ExperimentalCodeGen(
+ "pipeliner-experimental-cg", cl::Hidden, cl::init(false),
+ cl::desc(
+ "Use the experimental peeling code generator for software pipelining"));
+
namespace llvm {
// A command line option to enable the CopyToPhi DAG mutation.
@@ -314,7 +326,7 @@ bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
LI.LoopInductionVar = nullptr;
LI.LoopCompare = nullptr;
- if (TII->analyzeLoop(L, LI.LoopInductionVar, LI.LoopCompare)) {
+ if (!TII->analyzeLoopForPipelining(L.getTopBlock())) {
LLVM_DEBUG(
dbgs() << "Unable to analyzeLoop, can NOT pipeline current Loop\n");
NumFailLoop++;
@@ -349,7 +361,7 @@ void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
// If the operand uses a subregister, replace it with a new register
// without subregisters, and generate a copy to the new register.
- unsigned NewReg = MRI.createVirtualRegister(RC);
+ Register NewReg = MRI.createVirtualRegister(RC);
MachineBasicBlock &PredB = *PI.getOperand(i+1).getMBB();
MachineBasicBlock::iterator At = PredB.getFirstTerminator();
const DebugLoc &DL = PredB.findDebugLoc(At);
@@ -515,14 +527,49 @@ void SwingSchedulerDAG::schedule() {
return;
}
- generatePipelinedLoop(Schedule);
+ // Generate the schedule as a ModuloSchedule.
+ DenseMap<MachineInstr *, int> Cycles, Stages;
+ std::vector<MachineInstr *> OrderedInsts;
+ for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
+ ++Cycle) {
+ for (SUnit *SU : Schedule.getInstructions(Cycle)) {
+ OrderedInsts.push_back(SU->getInstr());
+ Cycles[SU->getInstr()] = Cycle;
+ Stages[SU->getInstr()] = Schedule.stageScheduled(SU);
+ }
+ }
+ DenseMap<MachineInstr *, std::pair<unsigned, int64_t>> NewInstrChanges;
+ for (auto &KV : NewMIs) {
+ Cycles[KV.first] = Cycles[KV.second];
+ Stages[KV.first] = Stages[KV.second];
+ NewInstrChanges[KV.first] = InstrChanges[getSUnit(KV.first)];
+ }
+
+ ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
+ std::move(Stages));
+ if (EmitTestAnnotations) {
+ assert(NewInstrChanges.empty() &&
+ "Cannot serialize a schedule with InstrChanges!");
+ ModuloScheduleTestAnnotater MSTI(MF, MS);
+ MSTI.annotate();
+ return;
+ }
+ // The experimental code generator can't work if there are InstChanges.
+ if (ExperimentalCodeGen && NewInstrChanges.empty()) {
+ PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
+ MSE.expand();
+ } else {
+ ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
+ MSE.expand();
+ MSE.cleanup();
+ }
++NumPipelined;
}
/// Clean up after the software pipeliner runs.
void SwingSchedulerDAG::finishBlock() {
- for (MachineInstr *I : NewMIs)
- MF.DeleteMachineInstr(I);
+ for (auto &KV : NewMIs)
+ MF.DeleteMachineInstr(KV.second);
NewMIs.clear();
// Call the superclass.
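Of the three code-generation paths selected above, the -pipeliner-annotate-for-testing route is the one that changes the pass's observable output: instead of expanding the schedule, it records it on the instructions for the -modulo-schedule-test pass named in the flag's description. A sketch of that path in isolation, using the names from the hunk above with the surrounding function assumed:

    // Build the schedule object, then serialize it as annotations rather
    // than expanding prolog/kernel/epilog blocks.
    ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
                      std::move(Stages));
    ModuloScheduleTestAnnotater MSTI(MF, MS);
    MSTI.annotate(); // leaves the original loop in place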
@@ -546,14 +593,6 @@ static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
}
-/// Return the Phi register value that comes from the incoming block.
-static unsigned getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
- for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
- if (Phi.getOperand(i + 1).getMBB() != LoopBB)
- return Phi.getOperand(i).getReg();
- return 0;
-}
-
/// Return the Phi register value that comes from the loop block.
static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
@@ -658,7 +697,7 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
TII->getMemOperandWithOffset(MI, BaseOp2, Offset2, TRI)) {
if (BaseOp1->isIdenticalTo(*BaseOp2) &&
(int)Offset1 < (int)Offset2) {
- assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI, AA) &&
+ assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI) &&
"What happened to the chain edge?");
SDep Dep(Load, SDep::Barrier);
Dep.setLatency(1);
@@ -730,7 +769,7 @@ void SwingSchedulerDAG::updatePhiDependences() {
MOI != MOE; ++MOI) {
if (!MOI->isReg())
continue;
- unsigned Reg = MOI->getReg();
+ Register Reg = MOI->getReg();
if (MOI->isDef()) {
// If the register is used by a Phi, then create an anti dependence.
for (MachineRegisterInfo::use_instr_iterator
@@ -809,7 +848,7 @@ void SwingSchedulerDAG::changeDependences() {
continue;
// Get the MI and SUnit for the instruction that defines the original base.
- unsigned OrigBase = I.getInstr()->getOperand(BasePos).getReg();
+ Register OrigBase = I.getInstr()->getOperand(BasePos).getReg();
MachineInstr *DefMI = MRI.getUniqueVRegDef(OrigBase);
if (!DefMI)
continue;
@@ -958,7 +997,7 @@ struct FuncUnitSorter {
unsigned F1 = 0, F2 = 0;
unsigned MFUs1 = minFuncUnits(IS1, F1);
unsigned MFUs2 = minFuncUnits(IS2, F2);
- if (MFUs1 == 1 && MFUs2 == 1)
+ if (MFUs1 == MFUs2)
return Resources.lookup(F1) < Resources.lookup(F2);
return MFUs1 > MFUs2;
}
@@ -1514,8 +1553,8 @@ static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,
continue;
for (const MachineOperand &MO : MI->operands())
if (MO.isReg() && MO.isUse()) {
- unsigned Reg = MO.getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ Register Reg = MO.getReg();
+ if (Register::isVirtualRegister(Reg))
Uses.insert(Reg);
else if (MRI.isAllocatable(Reg))
for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
@@ -1525,8 +1564,8 @@ static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,
for (SUnit *SU : NS)
for (const MachineOperand &MO : SU->getInstr()->operands())
if (MO.isReg() && MO.isDef() && !MO.isDead()) {
- unsigned Reg = MO.getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ Register Reg = MO.getReg();
+ if (Register::isVirtualRegister(Reg)) {
if (!Uses.count(Reg))
LiveOutRegs.push_back(RegisterMaskPair(Reg,
LaneBitmask::getNone()));
@@ -2012,836 +2051,6 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
return scheduleFound && Schedule.getMaxStageCount() > 0;
}
-/// Given a schedule for the loop, generate a new version of the loop,
-/// and replace the old version. This function generates a prolog
-/// that contains the initial iterations in the pipeline, and kernel
-/// loop, and the epilogue that contains the code for the final
-/// iterations.
-void SwingSchedulerDAG::generatePipelinedLoop(SMSchedule &Schedule) {
- // Create a new basic block for the kernel and add it to the CFG.
- MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
-
- unsigned MaxStageCount = Schedule.getMaxStageCount();
-
- // Remember the registers that are used in different stages. The index is
- // the iteration, or stage, that the instruction is scheduled in. This is
- // a map between register names in the original block and the names created
- // in each stage of the pipelined loop.
- ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];
- InstrMapTy InstrMap;
-
- SmallVector<MachineBasicBlock *, 4> PrologBBs;
-
- MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader();
- assert(PreheaderBB != nullptr &&
- "Need to add code to handle loops w/o preheader");
- // Generate the prolog instructions that set up the pipeline.
- generateProlog(Schedule, MaxStageCount, KernelBB, VRMap, PrologBBs);
- MF.insert(BB->getIterator(), KernelBB);
-
- // Rearrange the instructions to generate the new, pipelined loop,
- // and update register names as needed.
- for (int Cycle = Schedule.getFirstCycle(),
- LastCycle = Schedule.getFinalCycle();
- Cycle <= LastCycle; ++Cycle) {
- std::deque<SUnit *> &CycleInstrs = Schedule.getInstructions(Cycle);
- // This inner loop schedules each instruction in the cycle.
- for (SUnit *CI : CycleInstrs) {
- if (CI->getInstr()->isPHI())
- continue;
- unsigned StageNum = Schedule.stageScheduled(getSUnit(CI->getInstr()));
- MachineInstr *NewMI = cloneInstr(CI->getInstr(), MaxStageCount, StageNum);
- updateInstruction(NewMI, false, MaxStageCount, StageNum, Schedule, VRMap);
- KernelBB->push_back(NewMI);
- InstrMap[NewMI] = CI->getInstr();
- }
- }
-
- // Copy any terminator instructions to the new kernel, and update
- // names as needed.
- for (MachineBasicBlock::iterator I = BB->getFirstTerminator(),
- E = BB->instr_end();
- I != E; ++I) {
- MachineInstr *NewMI = MF.CloneMachineInstr(&*I);
- updateInstruction(NewMI, false, MaxStageCount, 0, Schedule, VRMap);
- KernelBB->push_back(NewMI);
- InstrMap[NewMI] = &*I;
- }
-
- KernelBB->transferSuccessors(BB);
- KernelBB->replaceSuccessor(BB, KernelBB);
-
- generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule,
- VRMap, InstrMap, MaxStageCount, MaxStageCount, false);
- generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, Schedule, VRMap,
- InstrMap, MaxStageCount, MaxStageCount, false);
-
- LLVM_DEBUG(dbgs() << "New block\n"; KernelBB->dump(););
-
- SmallVector<MachineBasicBlock *, 4> EpilogBBs;
- // Generate the epilog instructions to complete the pipeline.
- generateEpilog(Schedule, MaxStageCount, KernelBB, VRMap, EpilogBBs,
- PrologBBs);
-
- // We need this step because the register allocation doesn't handle some
- // situations well, so we insert copies to help out.
- splitLifetimes(KernelBB, EpilogBBs, Schedule);
-
- // Remove dead instructions due to loop induction variables.
- removeDeadInstructions(KernelBB, EpilogBBs);
-
- // Add branches between prolog and epilog blocks.
- addBranches(*PreheaderBB, PrologBBs, KernelBB, EpilogBBs, Schedule, VRMap);
-
- // Remove the original loop since it's no longer referenced.
- for (auto &I : *BB)
- LIS.RemoveMachineInstrFromMaps(I);
- BB->clear();
- BB->eraseFromParent();
-
- delete[] VRMap;
-}
-
-/// Generate the pipeline prolog code.
-void SwingSchedulerDAG::generateProlog(SMSchedule &Schedule, unsigned LastStage,
- MachineBasicBlock *KernelBB,
- ValueMapTy *VRMap,
- MBBVectorTy &PrologBBs) {
- MachineBasicBlock *PreheaderBB = MLI->getLoopFor(BB)->getLoopPreheader();
- assert(PreheaderBB != nullptr &&
- "Need to add code to handle loops w/o preheader");
- MachineBasicBlock *PredBB = PreheaderBB;
- InstrMapTy InstrMap;
-
- // Generate a basic block for each stage, not including the last stage,
- // which will be generated in the kernel. Each basic block may contain
- // instructions from multiple stages/iterations.
- for (unsigned i = 0; i < LastStage; ++i) {
- // Create and insert the prolog basic block prior to the original loop
- // basic block. The original loop is removed later.
- MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
- PrologBBs.push_back(NewBB);
- MF.insert(BB->getIterator(), NewBB);
- NewBB->transferSuccessors(PredBB);
- PredBB->addSuccessor(NewBB);
- PredBB = NewBB;
-
- // Generate instructions for each appropriate stage. Process instructions
- // in original program order.
- for (int StageNum = i; StageNum >= 0; --StageNum) {
- for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
- BBE = BB->getFirstTerminator();
- BBI != BBE; ++BBI) {
- if (Schedule.isScheduledAtStage(getSUnit(&*BBI), (unsigned)StageNum)) {
- if (BBI->isPHI())
- continue;
- MachineInstr *NewMI =
- cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum, Schedule);
- updateInstruction(NewMI, false, i, (unsigned)StageNum, Schedule,
- VRMap);
- NewBB->push_back(NewMI);
- InstrMap[NewMI] = &*BBI;
- }
- }
- }
- rewritePhiValues(NewBB, i, Schedule, VRMap, InstrMap);
- LLVM_DEBUG({
- dbgs() << "prolog:\n";
- NewBB->dump();
- });
- }
-
- PredBB->replaceSuccessor(BB, KernelBB);
-
- // Check if we need to remove the branch from the preheader to the original
- // loop, and replace it with a branch to the new loop.
- unsigned numBranches = TII->removeBranch(*PreheaderBB);
- if (numBranches) {
- SmallVector<MachineOperand, 0> Cond;
- TII->insertBranch(*PreheaderBB, PrologBBs[0], nullptr, Cond, DebugLoc());
- }
-}
-
-/// Generate the pipeline epilog code. The epilog code finishes the iterations
-/// that were started in either the prolog or the kernel. We create a basic
-/// block for each stage that needs to complete.
-void SwingSchedulerDAG::generateEpilog(SMSchedule &Schedule, unsigned LastStage,
- MachineBasicBlock *KernelBB,
- ValueMapTy *VRMap,
- MBBVectorTy &EpilogBBs,
- MBBVectorTy &PrologBBs) {
- // We need to change the branch from the kernel to the first epilog block, so
- // this call to analyze branch uses the kernel rather than the original BB.
- MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
- SmallVector<MachineOperand, 4> Cond;
- bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond);
- assert(!checkBranch && "generateEpilog must be able to analyze the branch");
- if (checkBranch)
- return;
-
- MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin();
- if (*LoopExitI == KernelBB)
- ++LoopExitI;
- assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor");
- MachineBasicBlock *LoopExitBB = *LoopExitI;
-
- MachineBasicBlock *PredBB = KernelBB;
- MachineBasicBlock *EpilogStart = LoopExitBB;
- InstrMapTy InstrMap;
-
- // Generate a basic block for each stage, not including the last stage,
- // which was generated for the kernel. Each basic block may contain
- // instructions from multiple stages/iterations.
- int EpilogStage = LastStage + 1;
- for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) {
- MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
- EpilogBBs.push_back(NewBB);
- MF.insert(BB->getIterator(), NewBB);
-
- PredBB->replaceSuccessor(LoopExitBB, NewBB);
- NewBB->addSuccessor(LoopExitBB);
-
- if (EpilogStart == LoopExitBB)
- EpilogStart = NewBB;
-
- // Add instructions to the epilog depending on the current block.
- // Process instructions in original program order.
- for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) {
- for (auto &BBI : *BB) {
- if (BBI.isPHI())
- continue;
- MachineInstr *In = &BBI;
- if (Schedule.isScheduledAtStage(getSUnit(In), StageNum)) {
- // Instructions with memoperands in the epilog are updated with
- // conservative values.
- MachineInstr *NewMI = cloneInstr(In, UINT_MAX, 0);
- updateInstruction(NewMI, i == 1, EpilogStage, 0, Schedule, VRMap);
- NewBB->push_back(NewMI);
- InstrMap[NewMI] = In;
- }
- }
- }
- generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule,
- VRMap, InstrMap, LastStage, EpilogStage, i == 1);
- generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, Schedule, VRMap,
- InstrMap, LastStage, EpilogStage, i == 1);
- PredBB = NewBB;
-
- LLVM_DEBUG({
- dbgs() << "epilog:\n";
- NewBB->dump();
- });
- }
-
- // Fix any Phi nodes in the loop exit block.
- for (MachineInstr &MI : *LoopExitBB) {
- if (!MI.isPHI())
- break;
- for (unsigned i = 2, e = MI.getNumOperands() + 1; i != e; i += 2) {
- MachineOperand &MO = MI.getOperand(i);
- if (MO.getMBB() == BB)
- MO.setMBB(PredBB);
- }
- }
-
- // Create a branch to the new epilog from the kernel.
- // Remove the original branch and add a new branch to the epilog.
- TII->removeBranch(*KernelBB);
- TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
- // Add a branch to the loop exit.
- if (EpilogBBs.size() > 0) {
- MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
- SmallVector<MachineOperand, 4> Cond1;
- TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc());
- }
-}
-
-/// Replace all uses of FromReg that appear outside the specified
-/// basic block with ToReg.
-static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg,
- MachineBasicBlock *MBB,
- MachineRegisterInfo &MRI,
- LiveIntervals &LIS) {
- for (MachineRegisterInfo::use_iterator I = MRI.use_begin(FromReg),
- E = MRI.use_end();
- I != E;) {
- MachineOperand &O = *I;
- ++I;
- if (O.getParent()->getParent() != MBB)
- O.setReg(ToReg);
- }
- if (!LIS.hasInterval(ToReg))
- LIS.createEmptyInterval(ToReg);
-}
-
-/// Return true if the register has a use that occurs outside the
-/// specified loop.
-static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB,
- MachineRegisterInfo &MRI) {
- for (MachineRegisterInfo::use_iterator I = MRI.use_begin(Reg),
- E = MRI.use_end();
- I != E; ++I)
- if (I->getParent()->getParent() != BB)
- return true;
- return false;
-}
-
-/// Generate Phis for the specific block in the generated pipelined code.
-/// This function looks at the Phis from the original code to guide the
-/// creation of new Phis.
-void SwingSchedulerDAG::generateExistingPhis(
- MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
- MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
- InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
- bool IsLast) {
- // Compute the stage number for the initial value of the Phi, which
- // comes from the prolog. The prolog to use depends on to which kernel/
- // epilog that we're adding the Phi.
- unsigned PrologStage = 0;
- unsigned PrevStage = 0;
- bool InKernel = (LastStageNum == CurStageNum);
- if (InKernel) {
- PrologStage = LastStageNum - 1;
- PrevStage = CurStageNum;
- } else {
- PrologStage = LastStageNum - (CurStageNum - LastStageNum);
- PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1;
- }
-
- for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
- BBE = BB->getFirstNonPHI();
- BBI != BBE; ++BBI) {
- unsigned Def = BBI->getOperand(0).getReg();
-
- unsigned InitVal = 0;
- unsigned LoopVal = 0;
- getPhiRegs(*BBI, BB, InitVal, LoopVal);
-
- unsigned PhiOp1 = 0;
- // The Phi value from the loop body typically is defined in the loop, but
- // not always. So, we need to check if the value is defined in the loop.
- unsigned PhiOp2 = LoopVal;
- if (VRMap[LastStageNum].count(LoopVal))
- PhiOp2 = VRMap[LastStageNum][LoopVal];
-
- int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
- int LoopValStage =
- Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
- unsigned NumStages = Schedule.getStagesForReg(Def, CurStageNum);
- if (NumStages == 0) {
- // We don't need to generate a Phi anymore, but we need to rename any uses
- // of the Phi value.
- unsigned NewReg = VRMap[PrevStage][LoopVal];
- rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, 0, &*BBI,
- Def, InitVal, NewReg);
- if (VRMap[CurStageNum].count(LoopVal))
- VRMap[CurStageNum][Def] = VRMap[CurStageNum][LoopVal];
- }
- // Adjust the number of Phis needed depending on the number of prologs left,
- // and the distance from where the Phi is first scheduled. The number of
- // Phis cannot exceed the number of prolog stages. Each stage can
- // potentially define two values.
- unsigned MaxPhis = PrologStage + 2;
- if (!InKernel && (int)PrologStage <= LoopValStage)
- MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1);
- unsigned NumPhis = std::min(NumStages, MaxPhis);
-
- unsigned NewReg = 0;
- unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled;
- // In the epilog, we may need to look back one stage to get the correct
- // Phi name because the epilog and prolog blocks execute the same stage.
- // The correct name is from the previous block only when the Phi has
- // been completely scheduled prior to the epilog, and the Phi value is not
- // needed in multiple stages.
- int StageDiff = 0;
- if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 &&
- NumPhis == 1)
- StageDiff = 1;
- // Adjust the computations below when the phi and the loop definition
- // are scheduled in different stages.
- if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage)
- StageDiff = StageScheduled - LoopValStage;
- for (unsigned np = 0; np < NumPhis; ++np) {
- // If the Phi hasn't been scheduled, then use the initial Phi operand
- // value. Otherwise, use the scheduled version of the instruction. This
- // is a little complicated when a Phi references another Phi.
- if (np > PrologStage || StageScheduled >= (int)LastStageNum)
- PhiOp1 = InitVal;
- // Check if the Phi has already been scheduled in a prolog stage.
- else if (PrologStage >= AccessStage + StageDiff + np &&
- VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0)
- PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal];
- // Check if the Phi has already been scheduled, but the loop instruction
- // is either another Phi, or doesn't occur in the loop.
- else if (PrologStage >= AccessStage + StageDiff + np) {
- // If the Phi references another Phi, we need to examine the other
- // Phi to get the correct value.
- PhiOp1 = LoopVal;
- MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1);
- int Indirects = 1;
- while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) {
- int PhiStage = Schedule.stageScheduled(getSUnit(InstOp1));
- if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects)
- PhiOp1 = getInitPhiReg(*InstOp1, BB);
- else
- PhiOp1 = getLoopPhiReg(*InstOp1, BB);
- InstOp1 = MRI.getVRegDef(PhiOp1);
- int PhiOpStage = Schedule.stageScheduled(getSUnit(InstOp1));
- int StageAdj = (PhiOpStage != -1 ? PhiStage - PhiOpStage : 0);
- if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np &&
- VRMap[PrologStage - StageAdj - Indirects - np].count(PhiOp1)) {
- PhiOp1 = VRMap[PrologStage - StageAdj - Indirects - np][PhiOp1];
- break;
- }
- ++Indirects;
- }
- } else
- PhiOp1 = InitVal;
- // If this references a generated Phi in the kernel, get the Phi operand
- // from the incoming block.
- if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1))
- if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
- PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
-
- MachineInstr *PhiInst = MRI.getVRegDef(LoopVal);
- bool LoopDefIsPhi = PhiInst && PhiInst->isPHI();
- // In the epilog, a map lookup is needed to get the value from the kernel,
- // or previous epilog block. How to do this depends on whether the
- // instruction is scheduled in the previous block.
- if (!InKernel) {
- int StageDiffAdj = 0;
- if (LoopValStage != -1 && StageScheduled > LoopValStage)
- StageDiffAdj = StageScheduled - LoopValStage;
- // Use the loop value defined in the kernel, unless the kernel
- // contains the last definition of the Phi.
- if (np == 0 && PrevStage == LastStageNum &&
- (StageScheduled != 0 || LoopValStage != 0) &&
- VRMap[PrevStage - StageDiffAdj].count(LoopVal))
- PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal];
- // Use the value defined by the Phi. We add one because we switch
- // from looking at the loop value to the Phi definition.
- else if (np > 0 && PrevStage == LastStageNum &&
- VRMap[PrevStage - np + 1].count(Def))
- PhiOp2 = VRMap[PrevStage - np + 1][Def];
- // Use the loop value defined in the kernel.
- else if (static_cast<unsigned>(LoopValStage) > PrologStage + 1 &&
- VRMap[PrevStage - StageDiffAdj - np].count(LoopVal))
- PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal];
- // Use the value defined by the Phi, unless we're generating the first
- // epilog and the Phi refers to a Phi in a different stage.
- else if (VRMap[PrevStage - np].count(Def) &&
- (!LoopDefIsPhi || (PrevStage != LastStageNum) || (LoopValStage == StageScheduled)))
- PhiOp2 = VRMap[PrevStage - np][Def];
- }
-
- // Check if we can reuse an existing Phi. This occurs when a Phi
- // references another Phi, and the other Phi is scheduled in an
- // earlier stage. We can try to reuse an existing Phi up until the last
- // stage of the current Phi.
- if (LoopDefIsPhi) {
- if (static_cast<int>(PrologStage - np) >= StageScheduled) {
- int LVNumStages = Schedule.getStagesForPhi(LoopVal);
- int StageDiff = (StageScheduled - LoopValStage);
- LVNumStages -= StageDiff;
- // Make sure the loop value Phi has been processed already.
- if (LVNumStages > (int)np && VRMap[CurStageNum].count(LoopVal)) {
- NewReg = PhiOp2;
- unsigned ReuseStage = CurStageNum;
- if (Schedule.isLoopCarried(this, *PhiInst))
- ReuseStage -= LVNumStages;
- // Check if the Phi to reuse has been generated yet. If not, then
- // there is nothing to reuse.
- if (VRMap[ReuseStage - np].count(LoopVal)) {
- NewReg = VRMap[ReuseStage - np][LoopVal];
-
- rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
- &*BBI, Def, NewReg);
- // Update the map with the new Phi name.
- VRMap[CurStageNum - np][Def] = NewReg;
- PhiOp2 = NewReg;
- if (VRMap[LastStageNum - np - 1].count(LoopVal))
- PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal];
-
- if (IsLast && np == NumPhis - 1)
- replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
- continue;
- }
- }
- }
- if (InKernel && StageDiff > 0 &&
- VRMap[CurStageNum - StageDiff - np].count(LoopVal))
- PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal];
- }
-
- const TargetRegisterClass *RC = MRI.getRegClass(Def);
- NewReg = MRI.createVirtualRegister(RC);
-
- MachineInstrBuilder NewPhi =
- BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
- TII->get(TargetOpcode::PHI), NewReg);
- NewPhi.addReg(PhiOp1).addMBB(BB1);
- NewPhi.addReg(PhiOp2).addMBB(BB2);
- if (np == 0)
- InstrMap[NewPhi] = &*BBI;
-
- // We define the Phis after creating the new pipelined code, so
- // we need to rename the Phi values in scheduled instructions.
-
- unsigned PrevReg = 0;
- if (InKernel && VRMap[PrevStage - np].count(LoopVal))
- PrevReg = VRMap[PrevStage - np][LoopVal];
- rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI,
- Def, NewReg, PrevReg);
- // If the Phi has been scheduled, use the new name for rewriting.
- if (VRMap[CurStageNum - np].count(Def)) {
- unsigned R = VRMap[CurStageNum - np][Def];
- rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np, &*BBI,
- R, NewReg);
- }
-
-      // Check if we need to rename any uses that occur after the loop. The
- // register to replace depends on whether the Phi is scheduled in the
- // epilog.
- if (IsLast && np == NumPhis - 1)
- replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
-
- // In the kernel, a dependent Phi uses the value from this Phi.
- if (InKernel)
- PhiOp2 = NewReg;
-
- // Update the map with the new Phi name.
- VRMap[CurStageNum - np][Def] = NewReg;
- }
-
- while (NumPhis++ < NumStages) {
- rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, NumPhis,
- &*BBI, Def, NewReg, 0);
- }
-
- // Check if we need to rename a Phi that has been eliminated due to
- // scheduling.
- if (NumStages == 0 && IsLast && VRMap[CurStageNum].count(LoopVal))
- replaceRegUsesAfterLoop(Def, VRMap[CurStageNum][LoopVal], BB, MRI, LIS);
- }
-}
-
-/// Generate Phis for the specified block in the generated pipelined code.
-/// These are new Phis needed because the definition is scheduled after the
-/// use in the pipelined sequence.
-void SwingSchedulerDAG::generatePhis(
- MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
- MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
- InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
- bool IsLast) {
- // Compute the stage number that contains the initial Phi value, and
- // the Phi from the previous stage.
- unsigned PrologStage = 0;
- unsigned PrevStage = 0;
- unsigned StageDiff = CurStageNum - LastStageNum;
- bool InKernel = (StageDiff == 0);
- if (InKernel) {
- PrologStage = LastStageNum - 1;
- PrevStage = CurStageNum;
- } else {
- PrologStage = LastStageNum - StageDiff;
- PrevStage = LastStageNum + StageDiff - 1;
- }
-
- for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(),
- BBE = BB->instr_end();
- BBI != BBE; ++BBI) {
- for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = BBI->getOperand(i);
- if (!MO.isReg() || !MO.isDef() ||
- !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
- continue;
-
- int StageScheduled = Schedule.stageScheduled(getSUnit(&*BBI));
- assert(StageScheduled != -1 && "Expecting scheduled instruction.");
- unsigned Def = MO.getReg();
- unsigned NumPhis = Schedule.getStagesForReg(Def, CurStageNum);
-      // An instruction scheduled in stage 0 that is used after the loop
-      // requires a Phi in the epilog for the last definition from either
- // the kernel or prolog.
- if (!InKernel && NumPhis == 0 && StageScheduled == 0 &&
- hasUseAfterLoop(Def, BB, MRI))
- NumPhis = 1;
- if (!InKernel && (unsigned)StageScheduled > PrologStage)
- continue;
-
- unsigned PhiOp2 = VRMap[PrevStage][Def];
- if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2))
- if (InstOp2->isPHI() && InstOp2->getParent() == NewBB)
- PhiOp2 = getLoopPhiReg(*InstOp2, BB2);
- // The number of Phis can't exceed the number of prolog stages. The
- // prolog stage number is zero based.
- if (NumPhis > PrologStage + 1 - StageScheduled)
- NumPhis = PrologStage + 1 - StageScheduled;
- for (unsigned np = 0; np < NumPhis; ++np) {
- unsigned PhiOp1 = VRMap[PrologStage][Def];
- if (np <= PrologStage)
- PhiOp1 = VRMap[PrologStage - np][Def];
- if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1)) {
- if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
- PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
- if (InstOp1->isPHI() && InstOp1->getParent() == NewBB)
- PhiOp1 = getInitPhiReg(*InstOp1, NewBB);
- }
- if (!InKernel)
- PhiOp2 = VRMap[PrevStage - np][Def];
-
- const TargetRegisterClass *RC = MRI.getRegClass(Def);
- unsigned NewReg = MRI.createVirtualRegister(RC);
-
- MachineInstrBuilder NewPhi =
- BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
- TII->get(TargetOpcode::PHI), NewReg);
- NewPhi.addReg(PhiOp1).addMBB(BB1);
- NewPhi.addReg(PhiOp2).addMBB(BB2);
- if (np == 0)
- InstrMap[NewPhi] = &*BBI;
-
- // Rewrite uses and update the map. The actions depend upon whether
-        // we are generating code for the kernel or epilog blocks.
- if (InKernel) {
- rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
- &*BBI, PhiOp1, NewReg);
- rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
- &*BBI, PhiOp2, NewReg);
-
- PhiOp2 = NewReg;
- VRMap[PrevStage - np - 1][Def] = NewReg;
- } else {
- VRMap[CurStageNum - np][Def] = NewReg;
- if (np == NumPhis - 1)
- rewriteScheduledInstr(NewBB, Schedule, InstrMap, CurStageNum, np,
- &*BBI, Def, NewReg);
- }
- if (IsLast && np == NumPhis - 1)
- replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
- }
- }
- }
-}
-
-/// Remove instructions that generate values with no uses.
-/// Typically, these are induction variable operations that generate values
-/// used in the loop itself. A dead instruction has a definition with
-/// no uses, or uses that occur in the original loop only.
-void SwingSchedulerDAG::removeDeadInstructions(MachineBasicBlock *KernelBB,
- MBBVectorTy &EpilogBBs) {
- // For each epilog block, check that the value defined by each instruction
- // is used. If not, delete it.
- for (MBBVectorTy::reverse_iterator MBB = EpilogBBs.rbegin(),
- MBE = EpilogBBs.rend();
- MBB != MBE; ++MBB)
- for (MachineBasicBlock::reverse_instr_iterator MI = (*MBB)->instr_rbegin(),
- ME = (*MBB)->instr_rend();
- MI != ME;) {
-      // From DeadMachineInstructionElim. Don't delete inline assembly.
- if (MI->isInlineAsm()) {
- ++MI;
- continue;
- }
- bool SawStore = false;
- // Check if it's safe to remove the instruction due to side effects.
- // We can, and want to, remove Phis here.
- if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) {
- ++MI;
- continue;
- }
- bool used = true;
- for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
- MOE = MI->operands_end();
- MOI != MOE; ++MOI) {
- if (!MOI->isReg() || !MOI->isDef())
- continue;
- unsigned reg = MOI->getReg();
- // Assume physical registers are used, unless they are marked dead.
- if (TargetRegisterInfo::isPhysicalRegister(reg)) {
- used = !MOI->isDead();
- if (used)
- break;
- continue;
- }
- unsigned realUses = 0;
- for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(reg),
- EI = MRI.use_end();
- UI != EI; ++UI) {
- // Check if there are any uses that occur only in the original
- // loop. If so, that's not a real use.
- if (UI->getParent()->getParent() != BB) {
- realUses++;
- used = true;
- break;
- }
- }
- if (realUses > 0)
- break;
- used = false;
- }
- if (!used) {
- LIS.RemoveMachineInstrFromMaps(*MI);
- MI++->eraseFromParent();
- continue;
- }
- ++MI;
- }
- // In the kernel block, check if we can remove a Phi that generates a value
- // used in an instruction removed in the epilog block.
- for (MachineBasicBlock::iterator BBI = KernelBB->instr_begin(),
- BBE = KernelBB->getFirstNonPHI();
- BBI != BBE;) {
- MachineInstr *MI = &*BBI;
- ++BBI;
- unsigned reg = MI->getOperand(0).getReg();
- if (MRI.use_begin(reg) == MRI.use_end()) {
- LIS.RemoveMachineInstrFromMaps(*MI);
- MI->eraseFromParent();
- }
- }
-}
-
-/// For loop carried definitions, we split the lifetime of a virtual register
-/// that has uses past the definition in the next iteration. A copy with a new
-/// virtual register is inserted before the definition, which helps with
-/// generating a better register assignment.
-///
-/// v1 = phi(a, v2) v1 = phi(a, v2)
-/// v2 = phi(b, v3) v2 = phi(b, v3)
-/// v3 = .. v4 = copy v1
-///      .. = v1              v3 = ..
-/// .. = v4
-void SwingSchedulerDAG::splitLifetimes(MachineBasicBlock *KernelBB,
- MBBVectorTy &EpilogBBs,
- SMSchedule &Schedule) {
- const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
- for (auto &PHI : KernelBB->phis()) {
- unsigned Def = PHI.getOperand(0).getReg();
-    // Check for any Phi definition that is used as an operand of another Phi
- // in the same block.
- for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def),
- E = MRI.use_instr_end();
- I != E; ++I) {
- if (I->isPHI() && I->getParent() == KernelBB) {
- // Get the loop carried definition.
- unsigned LCDef = getLoopPhiReg(PHI, KernelBB);
- if (!LCDef)
- continue;
- MachineInstr *MI = MRI.getVRegDef(LCDef);
- if (!MI || MI->getParent() != KernelBB || MI->isPHI())
- continue;
- // Search through the rest of the block looking for uses of the Phi
- // definition. If one occurs, then split the lifetime.
- unsigned SplitReg = 0;
- for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI),
- KernelBB->instr_end()))
- if (BBJ.readsRegister(Def)) {
- // We split the lifetime when we find the first use.
- if (SplitReg == 0) {
- SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def));
- BuildMI(*KernelBB, MI, MI->getDebugLoc(),
- TII->get(TargetOpcode::COPY), SplitReg)
- .addReg(Def);
- }
- BBJ.substituteRegister(Def, SplitReg, 0, *TRI);
- }
- if (!SplitReg)
- continue;
- // Search through each of the epilog blocks for any uses to be renamed.
- for (auto &Epilog : EpilogBBs)
- for (auto &I : *Epilog)
- if (I.readsRegister(Def))
- I.substituteRegister(Def, SplitReg, 0, *TRI);
- break;
- }
- }
- }
-}
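// Why the copy helps (an illustration of the doc comment above, not part of
// the patch): inserting "v4 = COPY v1" before v3's definition ends v1's
// loop-carried live range early, so v1 and v3 are less likely to conflict
// during register allocation; all later uses read v4 instead.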
-
-/// Remove the incoming block from the Phis in a basic block.
-static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
- for (MachineInstr &MI : *BB) {
- if (!MI.isPHI())
- break;
- for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2)
- if (MI.getOperand(i + 1).getMBB() == Incoming) {
- MI.RemoveOperand(i + 1);
- MI.RemoveOperand(i);
- break;
- }
- }
-}
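// Illustrative sketch (hypothetical MIR, for exposition only): given
//   %v = PHI %a, %bb.1, %b, %bb.2
// removePhis(BB, /*Incoming=*/%bb.2) erases the second value/block pair,
// leaving
//   %v = PHI %a, %bb.1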
-
-/// Create branches from each prolog basic block to the appropriate epilog
-/// block. These edges are needed if the loop ends before reaching the
-/// kernel.
-void SwingSchedulerDAG::addBranches(MachineBasicBlock &PreheaderBB,
- MBBVectorTy &PrologBBs,
- MachineBasicBlock *KernelBB,
- MBBVectorTy &EpilogBBs,
- SMSchedule &Schedule, ValueMapTy *VRMap) {
- assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch");
- MachineInstr *IndVar = Pass.LI.LoopInductionVar;
- MachineInstr *Cmp = Pass.LI.LoopCompare;
- MachineBasicBlock *LastPro = KernelBB;
- MachineBasicBlock *LastEpi = KernelBB;
-
- // Start from the blocks connected to the kernel and work "out"
- // to the first prolog and the last epilog blocks.
- SmallVector<MachineInstr *, 4> PrevInsts;
- unsigned MaxIter = PrologBBs.size() - 1;
- unsigned LC = UINT_MAX;
- unsigned LCMin = UINT_MAX;
- for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) {
- // Add branches to the prolog that go to the corresponding
- // epilog, and the fall-thru prolog/kernel block.
- MachineBasicBlock *Prolog = PrologBBs[j];
- MachineBasicBlock *Epilog = EpilogBBs[i];
- // We've executed one iteration, so decrement the loop count and check for
- // the loop end.
- SmallVector<MachineOperand, 4> Cond;
- // Check if the LOOP0 has already been removed. If so, then there is no need
- // to reduce the trip count.
- if (LC != 0)
- LC = TII->reduceLoopCount(*Prolog, PreheaderBB, IndVar, *Cmp, Cond,
- PrevInsts, j, MaxIter);
-
- // Record the value of the first trip count, which is used to determine if
- // branches and blocks can be removed for constant trip counts.
- if (LCMin == UINT_MAX)
- LCMin = LC;
-
- unsigned numAdded = 0;
- if (TargetRegisterInfo::isVirtualRegister(LC)) {
- Prolog->addSuccessor(Epilog);
- numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc());
- } else if (j >= LCMin) {
- Prolog->addSuccessor(Epilog);
- Prolog->removeSuccessor(LastPro);
- LastEpi->removeSuccessor(Epilog);
- numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc());
- removePhis(Epilog, LastEpi);
- // Remove the blocks that are no longer referenced.
- if (LastPro != LastEpi) {
- LastEpi->clear();
- LastEpi->eraseFromParent();
- }
- LastPro->clear();
- LastPro->eraseFromParent();
- } else {
- numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc());
- removePhis(Epilog, Prolog);
- }
- LastPro = Prolog;
- LastEpi = Epilog;
- for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(),
- E = Prolog->instr_rend();
- I != E && numAdded > 0; ++I, --numAdded)
- updateInstruction(&*I, false, j, 0, Schedule, VRMap);
- }
-}
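// Worked example (three prolog/epilog pairs, assumed sizes): MaxIter == 2,
// so the loop visits (i, j) = (0, 2), (1, 1), (2, 0), pairing PrologBBs[2]
// with EpilogBBs[0] first (the pair adjacent to the kernel) and ending at
// the outermost pair PrologBBs[0]/EpilogBBs[2].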
-
/// Return true if we can compute the amount the instruction changes
/// during each iteration. Set Delta to the amount of the change.
bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
@@ -2854,7 +2063,7 @@ bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
if (!BaseOp->isReg())
return false;
- unsigned BaseReg = BaseOp->getReg();
+ Register BaseReg = BaseOp->getReg();
MachineRegisterInfo &MRI = MF.getRegInfo();
// Check if there is a Phi. If so, get the definition in the loop.
@@ -2874,261 +2083,6 @@ bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
return true;
}
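// Worked example (assumed values, for illustration): if MI's base register
// is updated inside the loop by "%base = ADD %base, 8", computeDelta sets
// Delta = 8, i.e. the accessed address advances by 8 bytes per iteration.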
-/// Update the memory operand with a new offset when the pipeliner
-/// generates a new copy of the instruction that refers to a
-/// different memory location.
-void SwingSchedulerDAG::updateMemOperands(MachineInstr &NewMI,
- MachineInstr &OldMI, unsigned Num) {
- if (Num == 0)
- return;
- // If the instruction has memory operands, then adjust the offset
- // when the instruction appears in different stages.
- if (NewMI.memoperands_empty())
- return;
- SmallVector<MachineMemOperand *, 2> NewMMOs;
- for (MachineMemOperand *MMO : NewMI.memoperands()) {
- // TODO: Figure out whether isAtomic is really necessary (see D57601).
- if (MMO->isVolatile() || MMO->isAtomic() ||
- (MMO->isInvariant() && MMO->isDereferenceable()) ||
- (!MMO->getValue())) {
- NewMMOs.push_back(MMO);
- continue;
- }
- unsigned Delta;
- if (Num != UINT_MAX && computeDelta(OldMI, Delta)) {
- int64_t AdjOffset = Delta * Num;
- NewMMOs.push_back(
- MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize()));
- } else {
- NewMMOs.push_back(
- MF.getMachineMemOperand(MMO, 0, MemoryLocation::UnknownSize));
- }
- }
- NewMI.setMemRefs(MF, NewMMOs);
-}
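// Worked example (assumed values): with Delta = 8 from computeDelta and a
// clone placed Num = 2 stages away, each adjustable memory operand's offset
// advances by AdjOffset = 8 * 2 = 16. Operands whose per-iteration change is
// unknown are conservatively widened to an unknown-size location, while
// volatile, atomic, and invariant operands are kept unchanged.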
-
-/// Clone the instruction for the new pipelined loop and update the
-/// memory operands, if needed.
-MachineInstr *SwingSchedulerDAG::cloneInstr(MachineInstr *OldMI,
- unsigned CurStageNum,
- unsigned InstStageNum) {
- MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
- // Check for tied operands in inline asm instructions. This should be handled
- // elsewhere, but I'm not sure of the best solution.
- if (OldMI->isInlineAsm())
- for (unsigned i = 0, e = OldMI->getNumOperands(); i != e; ++i) {
- const auto &MO = OldMI->getOperand(i);
- if (MO.isReg() && MO.isUse())
- break;
- unsigned UseIdx;
- if (OldMI->isRegTiedToUseOperand(i, &UseIdx))
- NewMI->tieOperands(i, UseIdx);
- }
- updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
- return NewMI;
-}
-
-/// Clone the instruction for the new pipelined loop. If needed, this
-/// function updates the instruction using the values saved in the
-/// InstrChanges structure.
-MachineInstr *SwingSchedulerDAG::cloneAndChangeInstr(MachineInstr *OldMI,
- unsigned CurStageNum,
- unsigned InstStageNum,
- SMSchedule &Schedule) {
- MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
- DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
- InstrChanges.find(getSUnit(OldMI));
- if (It != InstrChanges.end()) {
- std::pair<unsigned, int64_t> RegAndOffset = It->second;
- unsigned BasePos, OffsetPos;
- if (!TII->getBaseAndOffsetPosition(*OldMI, BasePos, OffsetPos))
- return nullptr;
- int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm();
- MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first);
- if (Schedule.stageScheduled(getSUnit(LoopDef)) > (signed)InstStageNum)
- NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum);
- NewMI->getOperand(OffsetPos).setImm(NewOffset);
- }
- updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
- return NewMI;
-}
-
-/// Update the machine instruction with new virtual registers. This
-/// function may change the definitions and/or uses.
-void SwingSchedulerDAG::updateInstruction(MachineInstr *NewMI, bool LastDef,
- unsigned CurStageNum,
- unsigned InstrStageNum,
- SMSchedule &Schedule,
- ValueMapTy *VRMap) {
- for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
- MachineOperand &MO = NewMI->getOperand(i);
- if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
- continue;
- unsigned reg = MO.getReg();
- if (MO.isDef()) {
- // Create a new virtual register for the definition.
- const TargetRegisterClass *RC = MRI.getRegClass(reg);
- unsigned NewReg = MRI.createVirtualRegister(RC);
- MO.setReg(NewReg);
- VRMap[CurStageNum][reg] = NewReg;
- if (LastDef)
- replaceRegUsesAfterLoop(reg, NewReg, BB, MRI, LIS);
- } else if (MO.isUse()) {
- MachineInstr *Def = MRI.getVRegDef(reg);
-      // Compute the stage that contains the last definition for the instruction.
- int DefStageNum = Schedule.stageScheduled(getSUnit(Def));
- unsigned StageNum = CurStageNum;
- if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) {
-        // Compute the difference in stages between the definition and the use.
- unsigned StageDiff = (InstrStageNum - DefStageNum);
- // Make an adjustment to get the last definition.
- StageNum -= StageDiff;
- }
- if (VRMap[StageNum].count(reg))
- MO.setReg(VRMap[StageNum][reg]);
- }
- }
-}
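// Illustrative sketch (hypothetical vregs): when the stage-1 copy of
//   %v1 = ADD %v0, 4
// is emitted, the def is renamed to a fresh register and recorded as
// VRMap[1][%v1]; a use whose definition lives in an earlier stage resolves
// its operand through VRMap at the stage holding the last definition.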
-
-/// Return the instruction in the loop that defines the register.
-/// If the definition is a Phi, then follow the Phi operand to
-/// the instruction in the loop.
-MachineInstr *SwingSchedulerDAG::findDefInLoop(unsigned Reg) {
- SmallPtrSet<MachineInstr *, 8> Visited;
- MachineInstr *Def = MRI.getVRegDef(Reg);
- while (Def->isPHI()) {
- if (!Visited.insert(Def).second)
- break;
- for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
- if (Def->getOperand(i + 1).getMBB() == BB) {
- Def = MRI.getVRegDef(Def->getOperand(i).getReg());
- break;
- }
- }
- return Def;
-}
-
-/// Return the new name for the value from the previous stage.
-unsigned SwingSchedulerDAG::getPrevMapVal(unsigned StageNum, unsigned PhiStage,
- unsigned LoopVal, unsigned LoopStage,
- ValueMapTy *VRMap,
- MachineBasicBlock *BB) {
- unsigned PrevVal = 0;
- if (StageNum > PhiStage) {
- MachineInstr *LoopInst = MRI.getVRegDef(LoopVal);
- if (PhiStage == LoopStage && VRMap[StageNum - 1].count(LoopVal))
- // The name is defined in the previous stage.
- PrevVal = VRMap[StageNum - 1][LoopVal];
- else if (VRMap[StageNum].count(LoopVal))
- // The previous name is defined in the current stage when the instruction
- // order is swapped.
- PrevVal = VRMap[StageNum][LoopVal];
- else if (!LoopInst->isPHI() || LoopInst->getParent() != BB)
- // The loop value hasn't yet been scheduled.
- PrevVal = LoopVal;
- else if (StageNum == PhiStage + 1)
- // The loop value is another phi, which has not been scheduled.
- PrevVal = getInitPhiReg(*LoopInst, BB);
- else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB)
- // The loop value is another phi, which has been scheduled.
- PrevVal =
- getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB),
- LoopStage, VRMap, BB);
- }
- return PrevVal;
-}
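// Worked example (assumed stage numbers): with PhiStage == LoopStage == 0 and
// StageNum == 2, the first case applies and the renaming from the previous
// stage, VRMap[1][LoopVal], is returned when present; a loop value that is
// not a Phi in this block simply keeps its original name.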
-
-/// Rewrite the Phi values in the specified block to use the mappings
-/// from the initial operand. Once the Phi is scheduled, we switch
-/// to using the loop value instead of the Phi value, so those names
-/// do not need to be rewritten.
-void SwingSchedulerDAG::rewritePhiValues(MachineBasicBlock *NewBB,
- unsigned StageNum,
- SMSchedule &Schedule,
- ValueMapTy *VRMap,
- InstrMapTy &InstrMap) {
- for (auto &PHI : BB->phis()) {
- unsigned InitVal = 0;
- unsigned LoopVal = 0;
- getPhiRegs(PHI, BB, InitVal, LoopVal);
- unsigned PhiDef = PHI.getOperand(0).getReg();
-
- unsigned PhiStage =
- (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(PhiDef)));
- unsigned LoopStage =
- (unsigned)Schedule.stageScheduled(getSUnit(MRI.getVRegDef(LoopVal)));
- unsigned NumPhis = Schedule.getStagesForPhi(PhiDef);
- if (NumPhis > StageNum)
- NumPhis = StageNum;
- for (unsigned np = 0; np <= NumPhis; ++np) {
- unsigned NewVal =
- getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB);
- if (!NewVal)
- NewVal = InitVal;
- rewriteScheduledInstr(NewBB, Schedule, InstrMap, StageNum - np, np, &PHI,
- PhiDef, NewVal);
- }
- }
-}
-
-/// Rewrite a previously scheduled instruction to use the register value
-/// from the new instruction. Make sure the instruction occurs in the
-/// basic block, and we don't change the uses in the new instruction.
-void SwingSchedulerDAG::rewriteScheduledInstr(
- MachineBasicBlock *BB, SMSchedule &Schedule, InstrMapTy &InstrMap,
- unsigned CurStageNum, unsigned PhiNum, MachineInstr *Phi, unsigned OldReg,
- unsigned NewReg, unsigned PrevReg) {
- bool InProlog = (CurStageNum < Schedule.getMaxStageCount());
- int StagePhi = Schedule.stageScheduled(getSUnit(Phi)) + PhiNum;
- // Rewrite uses that have been scheduled already to use the new
- // Phi register.
- for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(OldReg),
- EI = MRI.use_end();
- UI != EI;) {
- MachineOperand &UseOp = *UI;
- MachineInstr *UseMI = UseOp.getParent();
- ++UI;
- if (UseMI->getParent() != BB)
- continue;
- if (UseMI->isPHI()) {
- if (!Phi->isPHI() && UseMI->getOperand(0).getReg() == NewReg)
- continue;
- if (getLoopPhiReg(*UseMI, BB) != OldReg)
- continue;
- }
- InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI);
- assert(OrigInstr != InstrMap.end() && "Instruction not scheduled.");
- SUnit *OrigMISU = getSUnit(OrigInstr->second);
- int StageSched = Schedule.stageScheduled(OrigMISU);
- int CycleSched = Schedule.cycleScheduled(OrigMISU);
- unsigned ReplaceReg = 0;
- // This is the stage for the scheduled instruction.
- if (StagePhi == StageSched && Phi->isPHI()) {
- int CyclePhi = Schedule.cycleScheduled(getSUnit(Phi));
- if (PrevReg && InProlog)
- ReplaceReg = PrevReg;
- else if (PrevReg && !Schedule.isLoopCarried(this, *Phi) &&
- (CyclePhi <= CycleSched || OrigMISU->getInstr()->isPHI()))
- ReplaceReg = PrevReg;
- else
- ReplaceReg = NewReg;
- }
- // The scheduled instruction occurs before the scheduled Phi, and the
- // Phi is not loop carried.
- if (!InProlog && StagePhi + 1 == StageSched &&
- !Schedule.isLoopCarried(this, *Phi))
- ReplaceReg = NewReg;
- if (StagePhi > StageSched && Phi->isPHI())
- ReplaceReg = NewReg;
- if (!InProlog && !Phi->isPHI() && StagePhi < StageSched)
- ReplaceReg = NewReg;
- if (ReplaceReg) {
- MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg));
- UseOp.setReg(ReplaceReg);
- }
- }
-}
-
/// Check if we can change the instruction to use an offset value from the
/// previous iteration. If so, return true and set the base and offset values
/// so that we can rewrite the load, if necessary.
@@ -3147,7 +2101,7 @@ bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
unsigned BasePosLd, OffsetPosLd;
if (!TII->getBaseAndOffsetPosition(*MI, BasePosLd, OffsetPosLd))
return false;
- unsigned BaseReg = MI->getOperand(BasePosLd).getReg();
+ Register BaseReg = MI->getOperand(BasePosLd).getReg();
// Look for the Phi instruction.
MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
@@ -3202,7 +2156,7 @@ void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
unsigned BasePos, OffsetPos;
if (!TII->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
return;
- unsigned BaseReg = MI->getOperand(BasePos).getReg();
+ Register BaseReg = MI->getOperand(BasePos).getReg();
MachineInstr *LoopDef = findDefInLoop(BaseReg);
int DefStageNum = Schedule.stageScheduled(getSUnit(LoopDef));
int DefCycleNum = Schedule.cycleScheduled(getSUnit(LoopDef));
@@ -3221,11 +2175,29 @@ void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
NewMI->getOperand(OffsetPos).setImm(NewOffset);
SU->setInstr(NewMI);
MISUnitMap[NewMI] = SU;
- NewMIs.insert(NewMI);
+ NewMIs[MI] = NewMI;
}
}
}
+/// Return the instruction in the loop that defines the register.
+/// If the definition is a Phi, then follow the Phi operand to
+/// the instruction in the loop.
+MachineInstr *SwingSchedulerDAG::findDefInLoop(unsigned Reg) {
+ SmallPtrSet<MachineInstr *, 8> Visited;
+ MachineInstr *Def = MRI.getVRegDef(Reg);
+ while (Def->isPHI()) {
+ if (!Visited.insert(Def).second)
+ break;
+ for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
+ if (Def->getOperand(i + 1).getMBB() == BB) {
+ Def = MRI.getVRegDef(Def->getOperand(i).getReg());
+ break;
+ }
+ }
+ return Def;
+}
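// Illustrative sketch (hypothetical MIR): for
//   %p = PHI %init0, %bb.preheader, %q, %bb.loop
//   %q = PHI %init1, %bb.preheader, %d, %bb.loop
//   %d = ADD %x, 4
// findDefInLoop(%p) follows the loop-edge operands %q and then %d and
// returns the ADD; the Visited set terminates cycles of mutually referencing
// Phis.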
+
/// Return true for an order or output dependence that is potentially loop
/// carried. A dependence is loop carried if the destination defines a value
/// that may be used or defined by the source in a subsequent iteration.
@@ -3499,10 +2471,10 @@ void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
++I, ++Pos) {
for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
MachineOperand &MO = MI->getOperand(i);
- if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg()))
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
unsigned BasePos, OffsetPos;
if (ST.getInstrInfo()->getBaseAndOffsetPosition(*MI, BasePos, OffsetPos))
if (MI->getOperand(BasePos).getReg() == Reg)
@@ -3676,7 +2648,7 @@ bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
assert(StageDef != -1 && "Instruction should have been scheduled.");
for (auto &SI : SU.Succs)
if (SI.isAssignedRegDep())
- if (ST.getRegisterInfo()->isPhysicalRegister(SI.getReg()))
+ if (Register::isPhysicalRegister(SI.getReg()))
if (stageScheduled(SI.getSUnit()) != StageDef)
return false;
}
@@ -3810,7 +2782,7 @@ void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
NewMI->getOperand(OffsetPos).setImm(NewOffset);
SU->setInstr(NewMI);
MISUnitMap[NewMI] = SU;
- NewMIs.insert(NewMI);
+ NewMIs[MI] = NewMI;
}
}
OverlapReg = 0;
@@ -3847,40 +2819,6 @@ void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
ScheduledInstrs[cycle].push_front(*I);
}
}
- // Iterate over the definitions in each instruction, and compute the
- // stage difference for each use. Keep the maximum value.
- for (auto &I : InstrToCycle) {
- int DefStage = stageScheduled(I.first);
- MachineInstr *MI = I.first->getInstr();
- for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
- MachineOperand &Op = MI->getOperand(i);
- if (!Op.isReg() || !Op.isDef())
- continue;
-
- unsigned Reg = Op.getReg();
- unsigned MaxDiff = 0;
- bool PhiIsSwapped = false;
- for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(Reg),
- EI = MRI.use_end();
- UI != EI; ++UI) {
- MachineOperand &UseOp = *UI;
- MachineInstr *UseMI = UseOp.getParent();
- SUnit *SUnitUse = SSD->getSUnit(UseMI);
- int UseStage = stageScheduled(SUnitUse);
- unsigned Diff = 0;
- if (UseStage != -1 && UseStage >= DefStage)
- Diff = UseStage - DefStage;
- if (MI->isPHI()) {
- if (isLoopCarried(SSD, *MI))
- ++Diff;
- else
- PhiIsSwapped = true;
- }
- MaxDiff = std::max(Diff, MaxDiff);
- }
- RegToStageDiff[Reg] = std::make_pair(MaxDiff, PhiIsSwapped);
- }
- }
// Erase all the elements in the later stages. Only one iteration should
// remain in the scheduled list, and it contains all the instructions.
@@ -4085,4 +3023,3 @@ void ResourceManager::clearResources() {
return DFAResources->clearResources();
std::fill(ProcResourceCount.begin(), ProcResourceCount.end(), 0);
}
-
diff --git a/lib/CodeGen/MachinePostDominators.cpp b/lib/CodeGen/MachinePostDominators.cpp
index 7f220ed1fd8f..f4daff667e86 100644
--- a/lib/CodeGen/MachinePostDominators.cpp
+++ b/lib/CodeGen/MachinePostDominators.cpp
@@ -17,7 +17,9 @@ using namespace llvm;
namespace llvm {
template class DominatorTreeBase<MachineBasicBlock, true>; // PostDomTreeBase
-}
+
+extern bool VerifyMachineDomInfo;
+} // namespace llvm
char MachinePostDominatorTree::ID = 0;
@@ -25,33 +27,52 @@ char MachinePostDominatorTree::ID = 0;
INITIALIZE_PASS(MachinePostDominatorTree, "machinepostdomtree",
"MachinePostDominator Tree Construction", true, true)
-MachinePostDominatorTree::MachinePostDominatorTree() : MachineFunctionPass(ID) {
+MachinePostDominatorTree::MachinePostDominatorTree()
+ : MachineFunctionPass(ID), PDT(nullptr) {
initializeMachinePostDominatorTreePass(*PassRegistry::getPassRegistry());
- DT = new PostDomTreeBase<MachineBasicBlock>();
}
-FunctionPass *
-MachinePostDominatorTree::createMachinePostDominatorTreePass() {
+FunctionPass *MachinePostDominatorTree::createMachinePostDominatorTreePass() {
return new MachinePostDominatorTree();
}
-bool
-MachinePostDominatorTree::runOnMachineFunction(MachineFunction &F) {
- DT->recalculate(F);
+bool MachinePostDominatorTree::runOnMachineFunction(MachineFunction &F) {
+ PDT = std::make_unique<PostDomTreeT>();
+ PDT->recalculate(F);
return false;
}
-MachinePostDominatorTree::~MachinePostDominatorTree() {
- delete DT;
-}
-
-void
-MachinePostDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const {
+void MachinePostDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}
-void
-MachinePostDominatorTree::print(llvm::raw_ostream &OS, const Module *M) const {
- DT->print(OS);
+MachineBasicBlock *MachinePostDominatorTree::findNearestCommonDominator(
+ ArrayRef<MachineBasicBlock *> Blocks) const {
+ assert(!Blocks.empty());
+
+ MachineBasicBlock *NCD = Blocks.front();
+ for (MachineBasicBlock *BB : Blocks.drop_front()) {
+ NCD = PDT->findNearestCommonDominator(NCD, BB);
+
+ // Stop when the root is reached.
+ if (PDT->isVirtualRoot(PDT->getNode(NCD)))
+ return nullptr;
+ }
+
+ return NCD;
+}
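// Usage sketch (hypothetical block pointers, for illustration only):
//   MachineBasicBlock *Join =
//       PDT.findNearestCommonDominator({BB0, BB1, BB2});
//   if (!Join) {
//     // Control from these blocks reconverges only at the virtual exit root.
//   }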
+
+void MachinePostDominatorTree::verifyAnalysis() const {
+ if (PDT && VerifyMachineDomInfo)
+ if (!PDT->verify(PostDomTreeT::VerificationLevel::Basic)) {
+ errs() << "MachinePostDominatorTree verification failed\n";
+
+ abort();
+ }
+}
+
+void MachinePostDominatorTree::print(llvm::raw_ostream &OS,
+ const Module *M) const {
+ PDT->print(OS);
}
diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
index f0fd0405d69d..b88d4ea462ef 100644
--- a/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/lib/CodeGen/MachineRegisterInfo.cpp
@@ -144,7 +144,7 @@ MachineRegisterInfo::recomputeRegClass(unsigned Reg) {
}
unsigned MachineRegisterInfo::createIncompleteVirtualRegister(StringRef Name) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(getNumVirtRegs());
+ unsigned Reg = Register::index2VirtReg(getNumVirtRegs());
VRegInfo.grow(Reg);
RegAllocHints.grow(Reg);
insertVRegByName(Name, Reg);
@@ -202,7 +202,7 @@ void MachineRegisterInfo::clearVirtRegTypes() { VRegToType.clear(); }
void MachineRegisterInfo::clearVirtRegs() {
#ifndef NDEBUG
for (unsigned i = 0, e = getNumVirtRegs(); i != e; ++i) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ unsigned Reg = Register::index2VirtReg(i);
if (!VRegInfo[Reg].second)
continue;
verifyUseList(Reg);
@@ -255,7 +255,7 @@ void MachineRegisterInfo::verifyUseList(unsigned Reg) const {
void MachineRegisterInfo::verifyUseLists() const {
#ifndef NDEBUG
for (unsigned i = 0, e = getNumVirtRegs(); i != e; ++i)
- verifyUseList(TargetRegisterInfo::index2VirtReg(i));
+ verifyUseList(Register::index2VirtReg(i));
for (unsigned i = 1, e = getTargetRegisterInfo()->getNumRegs(); i != e; ++i)
verifyUseList(i);
#endif
@@ -386,7 +386,7 @@ void MachineRegisterInfo::replaceRegWith(unsigned FromReg, unsigned ToReg) {
for (reg_iterator I = reg_begin(FromReg), E = reg_end(); I != E; ) {
MachineOperand &O = *I;
++I;
- if (TargetRegisterInfo::isPhysicalRegister(ToReg)) {
+ if (Register::isPhysicalRegister(ToReg)) {
O.substPhysReg(ToReg, *TRI);
} else {
O.setReg(ToReg);
@@ -498,7 +498,7 @@ MachineRegisterInfo::EmitLiveInCopies(MachineBasicBlock *EntryMBB,
LaneBitmask MachineRegisterInfo::getMaxLaneMaskForVReg(unsigned Reg) const {
// Lane masks are only defined for vregs.
- assert(TargetRegisterInfo::isVirtualRegister(Reg));
+ assert(Register::isVirtualRegister(Reg));
const TargetRegisterClass &TRC = *getRegClass(Reg);
return TRC.getLaneMask();
}
@@ -517,7 +517,7 @@ void MachineRegisterInfo::freezeReservedRegs(const MachineFunction &MF) {
}
bool MachineRegisterInfo::isConstantPhysReg(unsigned PhysReg) const {
- assert(TargetRegisterInfo::isPhysicalRegister(PhysReg));
+ assert(Register::isPhysicalRegister(PhysReg));
const TargetRegisterInfo *TRI = getTargetRegisterInfo();
if (TRI->isConstantPhysReg(PhysReg))
diff --git a/lib/CodeGen/MachineSSAUpdater.cpp b/lib/CodeGen/MachineSSAUpdater.cpp
index e8b42047b49f..258a5f9e0482 100644
--- a/lib/CodeGen/MachineSSAUpdater.cpp
+++ b/lib/CodeGen/MachineSSAUpdater.cpp
@@ -95,7 +95,7 @@ unsigned LookForIdenticalPHI(MachineBasicBlock *BB,
while (I != BB->end() && I->isPHI()) {
bool Same = true;
for (unsigned i = 1, e = I->getNumOperands(); i != e; i += 2) {
- unsigned SrcReg = I->getOperand(i).getReg();
+ Register SrcReg = I->getOperand(i).getReg();
MachineBasicBlock *SrcBB = I->getOperand(i+1).getMBB();
if (AVals[SrcBB] != SrcReg) {
Same = false;
@@ -118,7 +118,7 @@ MachineInstrBuilder InsertNewDef(unsigned Opcode,
const TargetRegisterClass *RC,
MachineRegisterInfo *MRI,
const TargetInstrInfo *TII) {
- unsigned NewVR = MRI->createVirtualRegister(RC);
+ Register NewVR = MRI->createVirtualRegister(RC);
return BuildMI(*BB, I, DebugLoc(), TII->get(Opcode), NewVR);
}
@@ -292,7 +292,7 @@ public:
MachineSSAUpdater *Updater) {
// Insert an implicit_def to represent an undef value.
MachineInstr *NewDef = InsertNewDef(TargetOpcode::IMPLICIT_DEF,
- BB, BB->getFirstTerminator(),
+ BB, BB->getFirstNonPHI(),
Updater->VRC, Updater->MRI,
Updater->TII);
return NewDef->getOperand(0).getReg();
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index ae1170ad1be6..f0721ea3b76d 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -82,6 +82,10 @@ cl::opt<bool>
DumpCriticalPathLength("misched-dcpl", cl::Hidden,
cl::desc("Print critical path length to stdout"));
+cl::opt<bool> VerifyScheduling(
+ "verify-misched", cl::Hidden,
+ cl::desc("Verify machine instrs before and after machine scheduling"));
+
} // end namespace llvm
#ifndef NDEBUG
@@ -122,9 +126,6 @@ static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden,
cl::desc("Enable memop clustering."),
cl::init(true));
-static cl::opt<bool> VerifyScheduling("verify-misched", cl::Hidden,
- cl::desc("Verify machine instrs before and after machine scheduling"));
-
// DAG subtrees must have at least this many nodes.
static const unsigned MinSubtreeSize = 8;
@@ -198,6 +199,7 @@ char &llvm::MachineSchedulerID = MachineScheduler::ID;
INITIALIZE_PASS_BEGIN(MachineScheduler, DEBUG_TYPE,
"Machine Instruction Scheduler", false, false)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
@@ -210,7 +212,7 @@ MachineScheduler::MachineScheduler() : MachineSchedulerBase(ID) {
void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
- AU.addRequiredID(MachineDominatorsID);
+ AU.addRequired<MachineDominatorTree>();
AU.addRequired<MachineLoopInfo>();
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<TargetPassConfig>();
@@ -234,7 +236,7 @@ PostMachineScheduler::PostMachineScheduler() : MachineSchedulerBase(ID) {
void PostMachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
- AU.addRequiredID(MachineDominatorsID);
+ AU.addRequired<MachineDominatorTree>();
AU.addRequired<MachineLoopInfo>();
AU.addRequired<TargetPassConfig>();
MachineFunctionPass::getAnalysisUsage(AU);
@@ -933,8 +935,8 @@ void ScheduleDAGMILive::collectVRegUses(SUnit &SU) {
if (TrackLaneMasks && !MO.isUse())
continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ Register Reg = MO.getReg();
+ if (!Register::isVirtualRegister(Reg))
continue;
// Ignore re-defs.
@@ -985,7 +987,7 @@ void ScheduleDAGMILive::enterRegion(MachineBasicBlock *bb,
"ShouldTrackLaneMasks requires ShouldTrackPressure");
}
-// Setup the register pressure trackers for the top scheduled top and bottom
+// Setup the register pressure trackers for the top scheduled and bottom
// scheduled regions.
void ScheduleDAGMILive::initRegPressure() {
VRegUses.clear();
@@ -1095,7 +1097,7 @@ void ScheduleDAGMILive::updatePressureDiffs(
for (const RegisterMaskPair &P : LiveUses) {
unsigned Reg = P.RegUnit;
/// FIXME: Currently assuming single-use physregs.
- if (!TRI->isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
continue;
if (ShouldTrackLaneMasks) {
@@ -1319,8 +1321,8 @@ unsigned ScheduleDAGMILive::computeCyclicCriticalPath() {
// Visit each live out vreg def to find def/use pairs that cross iterations.
for (const RegisterMaskPair &P : RPTracker.getPressure().LiveOutRegs) {
unsigned Reg = P.RegUnit;
- if (!TRI->isVirtualRegister(Reg))
- continue;
+ if (!Register::isVirtualRegister(Reg))
+ continue;
const LiveInterval &LI = LIS->getInterval(Reg);
const VNInfo *DefVNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
if (!DefVNI)
@@ -1538,14 +1540,14 @@ namespace llvm {
std::unique_ptr<ScheduleDAGMutation>
createLoadClusterDAGMutation(const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI) {
- return EnableMemOpCluster ? llvm::make_unique<LoadClusterMutation>(TII, TRI)
+ return EnableMemOpCluster ? std::make_unique<LoadClusterMutation>(TII, TRI)
: nullptr;
}
std::unique_ptr<ScheduleDAGMutation>
createStoreClusterDAGMutation(const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI) {
- return EnableMemOpCluster ? llvm::make_unique<StoreClusterMutation>(TII, TRI)
+ return EnableMemOpCluster ? std::make_unique<StoreClusterMutation>(TII, TRI)
: nullptr;
}
@@ -1657,7 +1659,7 @@ namespace llvm {
std::unique_ptr<ScheduleDAGMutation>
createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI) {
- return llvm::make_unique<CopyConstrain>(TII, TRI);
+ return std::make_unique<CopyConstrain>(TII, TRI);
}
} // end namespace llvm
@@ -1687,13 +1689,13 @@ void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) {
// Check for pure vreg copies.
const MachineOperand &SrcOp = Copy->getOperand(1);
- unsigned SrcReg = SrcOp.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || !SrcOp.readsReg())
+ Register SrcReg = SrcOp.getReg();
+ if (!Register::isVirtualRegister(SrcReg) || !SrcOp.readsReg())
return;
const MachineOperand &DstOp = Copy->getOperand(0);
- unsigned DstReg = DstOp.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(DstReg) || DstOp.isDead())
+ Register DstReg = DstOp.getReg();
+ if (!Register::isVirtualRegister(DstReg) || DstOp.isDead())
return;
// Check if either the dest or source is local. If it's live across a back
@@ -2914,14 +2916,12 @@ int biasPhysReg(const SUnit *SU, bool isTop) {
unsigned UnscheduledOper = isTop ? 0 : 1;
// If we have already scheduled the physreg produce/consumer, immediately
// schedule the copy.
- if (TargetRegisterInfo::isPhysicalRegister(
- MI->getOperand(ScheduledOper).getReg()))
+ if (Register::isPhysicalRegister(MI->getOperand(ScheduledOper).getReg()))
return 1;
// If the physreg is at the boundary, defer it. Otherwise schedule it
// immediately to free the dependent. We can hoist the copy later.
bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft;
- if (TargetRegisterInfo::isPhysicalRegister(
- MI->getOperand(UnscheduledOper).getReg()))
+ if (Register::isPhysicalRegister(MI->getOperand(UnscheduledOper).getReg()))
return AtBoundary ? -1 : 1;
}
@@ -2931,7 +2931,7 @@ int biasPhysReg(const SUnit *SU, bool isTop) {
// physical registers.
bool DoBias = true;
for (const MachineOperand &Op : MI->defs()) {
- if (Op.isReg() && !TargetRegisterInfo::isPhysicalRegister(Op.getReg())) {
+ if (Op.isReg() && !Register::isPhysicalRegister(Op.getReg())) {
DoBias = false;
break;
}
@@ -3259,7 +3259,8 @@ void GenericScheduler::reschedulePhysReg(SUnit *SU, bool isTop) {
// Find already scheduled copies with a single physreg dependence and move
// them just above the scheduled instruction.
for (SDep &Dep : Deps) {
- if (Dep.getKind() != SDep::Data || !TRI->isPhysicalRegister(Dep.getReg()))
+ if (Dep.getKind() != SDep::Data ||
+ !Register::isPhysicalRegister(Dep.getReg()))
continue;
SUnit *DepSU = Dep.getSUnit();
if (isTop ? DepSU->Succs.size() > 1 : DepSU->Preds.size() > 1)
@@ -3298,7 +3299,7 @@ void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
/// default scheduler if the target does not set a default.
ScheduleDAGMILive *llvm::createGenericSchedLive(MachineSchedContext *C) {
ScheduleDAGMILive *DAG =
- new ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C));
+ new ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C));
// Register DAG post-processors.
//
// FIXME: extend the mutation API to allow earlier mutations to instantiate
@@ -3450,7 +3451,7 @@ void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
}
ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) {
- return new ScheduleDAGMI(C, llvm::make_unique<PostGenericScheduler>(C),
+ return new ScheduleDAGMI(C, std::make_unique<PostGenericScheduler>(C),
/*RemoveKillFlags=*/true);
}
@@ -3561,10 +3562,10 @@ public:
} // end anonymous namespace
static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) {
- return new ScheduleDAGMILive(C, llvm::make_unique<ILPScheduler>(true));
+ return new ScheduleDAGMILive(C, std::make_unique<ILPScheduler>(true));
}
static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) {
- return new ScheduleDAGMILive(C, llvm::make_unique<ILPScheduler>(false));
+ return new ScheduleDAGMILive(C, std::make_unique<ILPScheduler>(false));
}
static MachineSchedRegistry ILPMaxRegistry(
@@ -3658,7 +3659,7 @@ static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) {
assert((TopDown || !ForceTopDown) &&
"-misched-topdown incompatible with -misched-bottomup");
return new ScheduleDAGMILive(
- C, llvm::make_unique<InstructionShuffler>(Alternate, TopDown));
+ C, std::make_unique<InstructionShuffler>(Alternate, TopDown));
}
static MachineSchedRegistry ShufflerRegistry(
diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp
index 41db2c88ce50..27a2e7023f22 100644
--- a/lib/CodeGen/MachineSink.cpp
+++ b/lib/CodeGen/MachineSink.cpp
@@ -36,8 +36,9 @@
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/BranchProbability.h"
#include "llvm/Support/CommandLine.h"
@@ -114,15 +115,12 @@ namespace {
bool runOnMachineFunction(MachineFunction &MF) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
MachineFunctionPass::getAnalysisUsage(AU);
AU.addRequired<AAResultsWrapperPass>();
AU.addRequired<MachineDominatorTree>();
AU.addRequired<MachinePostDominatorTree>();
AU.addRequired<MachineLoopInfo>();
AU.addRequired<MachineBranchProbabilityInfo>();
- AU.addPreserved<MachineDominatorTree>();
- AU.addPreserved<MachinePostDominatorTree>();
AU.addPreserved<MachineLoopInfo>();
if (UseBlockFreqInfo)
AU.addRequired<MachineBlockFrequencyInfo>();
@@ -195,11 +193,10 @@ bool MachineSinking::PerformTrivialForwardCoalescing(MachineInstr &MI,
if (!MI.isCopy())
return false;
- unsigned SrcReg = MI.getOperand(1).getReg();
- unsigned DstReg = MI.getOperand(0).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
- !TargetRegisterInfo::isVirtualRegister(DstReg) ||
- !MRI->hasOneNonDBGUse(SrcReg))
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
+ if (!Register::isVirtualRegister(SrcReg) ||
+ !Register::isVirtualRegister(DstReg) || !MRI->hasOneNonDBGUse(SrcReg))
return false;
const TargetRegisterClass *SRC = MRI->getRegClass(SrcReg);
@@ -233,8 +230,7 @@ MachineSinking::AllUsesDominatedByBlock(unsigned Reg,
MachineBasicBlock *DefMBB,
bool &BreakPHIEdge,
bool &LocalUse) const {
- assert(TargetRegisterInfo::isVirtualRegister(Reg) &&
- "Only makes sense for vregs");
+ assert(Register::isVirtualRegister(Reg) && "Only makes sense for vregs");
// Ignore debug uses because debug info doesn't affect the code.
if (MRI->use_nodbg_empty(Reg))
@@ -416,13 +412,13 @@ bool MachineSinking::isWorthBreakingCriticalEdge(MachineInstr &MI,
const MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg() || !MO.isUse())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == 0)
continue;
// We don't move live definitions of physical registers,
// so sinking their uses won't enable any opportunities.
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
continue;
// If this instruction is the only user of a virtual register,
@@ -615,10 +611,10 @@ MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB,
const MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg()) continue; // Ignore non-register operands.
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == 0) continue;
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (Register::isPhysicalRegister(Reg)) {
if (MO.isUse()) {
// If the physreg has no defs anywhere, it's just an ambient register
// and we can freely move its uses. Alternatively, if it's allocatable,
@@ -817,8 +813,9 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
const MachineOperand &MO = MI.getOperand(I);
if (!MO.isReg()) continue;
- unsigned Reg = MO.getReg();
- if (Reg == 0 || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+ Register Reg = MO.getReg();
+ if (Reg == 0 || !Register::isPhysicalRegister(Reg))
+ continue;
if (SuccToSinkTo->isLiveIn(Reg))
return false;
}
@@ -958,8 +955,9 @@ private:
/// Track which register units have been modified and used.
LiveRegUnits ModifiedRegUnits, UsedRegUnits;
- /// Track DBG_VALUEs of (unmodified) register units.
- DenseMap<unsigned, TinyPtrVector<MachineInstr*>> SeenDbgInstrs;
+ /// Track DBG_VALUEs of (unmodified) register units. Each DBG_VALUE has an
+ /// entry in this map for each unit it touches.
+ DenseMap<unsigned, TinyPtrVector<MachineInstr *>> SeenDbgInstrs;
/// Sink Copy instructions unused in the same block close to their uses in
/// successors.
@@ -1030,7 +1028,7 @@ static void clearKillFlags(MachineInstr *MI, MachineBasicBlock &CurBB,
const TargetRegisterInfo *TRI) {
for (auto U : UsedOpsInCopy) {
MachineOperand &MO = MI->getOperand(U);
- unsigned SrcReg = MO.getReg();
+ Register SrcReg = MO.getReg();
if (!UsedRegUnits.available(SrcReg)) {
MachineBasicBlock::iterator NI = std::next(MI->getIterator());
for (MachineInstr &UI : make_range(NI, CurBB.end())) {
@@ -1053,7 +1051,7 @@ static void updateLiveIn(MachineInstr *MI, MachineBasicBlock *SuccBB,
for (MCSubRegIterator S(DefReg, TRI, true); S.isValid(); ++S)
SuccBB->removeLiveIn(*S);
for (auto U : UsedOpsInCopy) {
- unsigned Reg = MI->getOperand(U).getReg();
+ Register Reg = MI->getOperand(U).getReg();
if (!SuccBB->isLiveIn(Reg))
SuccBB->addLiveIn(Reg);
}
@@ -1069,7 +1067,7 @@ static bool hasRegisterDependency(MachineInstr *MI,
MachineOperand &MO = MI->getOperand(i);
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
if (MO.isDef()) {
@@ -1094,6 +1092,14 @@ static bool hasRegisterDependency(MachineInstr *MI,
return HasRegDependency;
}
+static SmallSet<unsigned, 4> getRegUnits(unsigned Reg,
+ const TargetRegisterInfo *TRI) {
+ SmallSet<unsigned, 4> RegUnits;
+ for (auto RI = MCRegUnitIterator(Reg, TRI); RI.isValid(); ++RI)
+ RegUnits.insert(*RI);
+ return RegUnits;
+}
+
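// Why register units: a DBG_VALUE may read a sub- or super-register of the
// register a sunk copy defines, so overlap is tracked per unit rather than
// per named register. Hedged usage sketch (SomePhysReg is a placeholder;
// unit numbering is target-specific):
//   SmallSet<unsigned, 4> Units = getRegUnits(SomePhysReg, TRI);
//   // Units now contains every register unit that SomePhysReg covers.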
bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB,
MachineFunction &MF,
const TargetRegisterInfo *TRI,
@@ -1130,15 +1136,17 @@ bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB,
// for DBG_VALUEs later, record them when they're encountered.
if (MI->isDebugValue()) {
auto &MO = MI->getOperand(0);
- if (MO.isReg() && TRI->isPhysicalRegister(MO.getReg())) {
+ if (MO.isReg() && Register::isPhysicalRegister(MO.getReg())) {
// Bail if we can already tell the sink would be rejected, rather
// than needlessly accumulating lots of DBG_VALUEs.
if (hasRegisterDependency(MI, UsedOpsInCopy, DefedRegsInCopy,
ModifiedRegUnits, UsedRegUnits))
continue;
- // Record debug use of this register.
- SeenDbgInstrs[MO.getReg()].push_back(MI);
+ // Record debug use of each reg unit.
+ SmallSet<unsigned, 4> Units = getRegUnits(MO.getReg(), TRI);
+ for (unsigned Reg : Units)
+ SeenDbgInstrs[Reg].push_back(MI);
}
continue;
}
@@ -1177,15 +1185,22 @@ bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB,
assert((SuccBB->pred_size() == 1 && *SuccBB->pred_begin() == &CurBB) &&
"Unexpected predecessor");
- // Collect DBG_VALUEs that must sink with this copy.
+ // Collect DBG_VALUEs that must sink with this copy. We've previously
+    // recorded which reg units each DBG_VALUE reads; if this instruction
+    // writes any of those units, then the corresponding DBG_VALUEs must sink.
+ SetVector<MachineInstr *> DbgValsToSinkSet;
SmallVector<MachineInstr *, 4> DbgValsToSink;
for (auto &MO : MI->operands()) {
if (!MO.isReg() || !MO.isDef())
continue;
- unsigned reg = MO.getReg();
- for (auto *MI : SeenDbgInstrs.lookup(reg))
- DbgValsToSink.push_back(MI);
+
+ SmallSet<unsigned, 4> Units = getRegUnits(MO.getReg(), TRI);
+ for (unsigned Reg : Units)
+ for (auto *MI : SeenDbgInstrs.lookup(Reg))
+ DbgValsToSinkSet.insert(MI);
}
+ DbgValsToSink.insert(DbgValsToSink.begin(), DbgValsToSinkSet.begin(),
+ DbgValsToSinkSet.end());
// Clear the kill flag if SrcReg is killed between MI and the end of the
// block.
diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp
index f9505df4e7f4..66a3bc2f8cc4 100644
--- a/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/lib/CodeGen/MachineTraceMetrics.cpp
@@ -634,7 +634,7 @@ struct DataDep {
/// Create a DataDep from an SSA form virtual register.
DataDep(const MachineRegisterInfo *MRI, unsigned VirtReg, unsigned UseOp)
: UseOp(UseOp) {
- assert(TargetRegisterInfo::isVirtualRegister(VirtReg));
+ assert(Register::isVirtualRegister(VirtReg));
MachineRegisterInfo::def_iterator DefI = MRI->def_begin(VirtReg);
assert(!DefI.atEnd() && "Register has no defs");
DefMI = DefI->getParent();
@@ -660,10 +660,10 @@ static bool getDataDeps(const MachineInstr &UseMI,
const MachineOperand &MO = *I;
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (Register::isPhysicalRegister(Reg)) {
HasPhysRegs = true;
continue;
}
@@ -687,7 +687,7 @@ static void getPHIDeps(const MachineInstr &UseMI,
assert(UseMI.isPHI() && UseMI.getNumOperands() % 2 && "Bad PHI");
for (unsigned i = 1; i != UseMI.getNumOperands(); i += 2) {
if (UseMI.getOperand(i + 1).getMBB() == Pred) {
- unsigned Reg = UseMI.getOperand(i).getReg();
+ Register Reg = UseMI.getOperand(i).getReg();
Deps.push_back(DataDep(MRI, Reg, i));
return;
}
@@ -708,8 +708,8 @@ static void updatePhysDepsDownwards(const MachineInstr *UseMI,
const MachineOperand &MO = *MI;
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ Register Reg = MO.getReg();
+ if (!Register::isPhysicalRegister(Reg))
continue;
// Track live defs and kills for updating RegUnits.
if (MO.isDef()) {
@@ -765,7 +765,7 @@ computeCrossBlockCriticalPath(const TraceBlockInfo &TBI) {
assert(TBI.HasValidInstrHeights && "Missing height info");
unsigned MaxLen = 0;
for (const LiveInReg &LIR : TBI.LiveIns) {
- if (!TargetRegisterInfo::isVirtualRegister(LIR.Reg))
+ if (!Register::isVirtualRegister(LIR.Reg))
continue;
const MachineInstr *DefMI = MTM.MRI->getVRegDef(LIR.Reg);
// Ignore dependencies outside the current trace.
@@ -902,8 +902,8 @@ static unsigned updatePhysDepsUpwards(const MachineInstr &MI, unsigned Height,
const MachineOperand &MO = *MOI;
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ Register Reg = MO.getReg();
+ if (!Register::isPhysicalRegister(Reg))
continue;
if (MO.readsReg())
ReadOps.push_back(MI.getOperandNo(MOI));
@@ -930,7 +930,7 @@ static unsigned updatePhysDepsUpwards(const MachineInstr &MI, unsigned Height,
// Now we know the height of MI. Update any regunits read.
for (unsigned i = 0, e = ReadOps.size(); i != e; ++i) {
- unsigned Reg = MI.getOperand(ReadOps[i]).getReg();
+ Register Reg = MI.getOperand(ReadOps[i]).getReg();
for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
LiveRegUnit &LRU = RegUnits[*Units];
// Set the height to the highest reader of the unit.
@@ -979,7 +979,7 @@ addLiveIns(const MachineInstr *DefMI, unsigned DefOp,
ArrayRef<const MachineBasicBlock*> Trace) {
assert(!Trace.empty() && "Trace should contain at least one block");
unsigned Reg = DefMI->getOperand(DefOp).getReg();
- assert(TargetRegisterInfo::isVirtualRegister(Reg));
+ assert(Register::isVirtualRegister(Reg));
const MachineBasicBlock *DefMBB = DefMI->getParent();
// Reg is live-in to all blocks in Trace that follow DefMBB.
@@ -1026,7 +1026,7 @@ computeInstrHeights(const MachineBasicBlock *MBB) {
if (MBB) {
TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
for (LiveInReg &LI : TBI.LiveIns) {
- if (TargetRegisterInfo::isVirtualRegister(LI.Reg)) {
+ if (Register::isVirtualRegister(LI.Reg)) {
// For virtual registers, the def latency is included.
unsigned &Height = Heights[MTM.MRI->getVRegDef(LI.Reg)];
if (Height < LI.Height)
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index 0ad792ac62cf..969743edca52 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -22,7 +22,6 @@
// the verifier errors.
//===----------------------------------------------------------------------===//
-#include "LiveRangeCalc.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
@@ -37,6 +36,7 @@
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRangeCalc.h"
#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -122,7 +122,7 @@ namespace {
// Add Reg and any sub-registers to RV
void addRegWithSubRegs(RegVector &RV, unsigned Reg) {
RV.push_back(Reg);
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
RV.push_back(*SubRegs);
}
@@ -159,7 +159,7 @@ namespace {
// Add register to vregsPassed if it belongs there. Return true if
// anything changed.
bool addPassed(unsigned Reg) {
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
return false;
if (regsKilled.count(Reg) || regsLiveOut.count(Reg))
return false;
@@ -178,7 +178,7 @@ namespace {
// Add register to vregsRequired if it belongs there. Return true if
// anything changed.
bool addRequired(unsigned Reg) {
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
return false;
if (regsLiveOut.count(Reg))
return false;
@@ -552,7 +552,7 @@ void MachineVerifier::report_context_vreg(unsigned VReg) const {
}
void MachineVerifier::report_context_vreg_regunit(unsigned VRegOrUnit) const {
- if (TargetRegisterInfo::isVirtualRegister(VRegOrUnit)) {
+ if (Register::isVirtualRegister(VRegOrUnit)) {
report_context_vreg(VRegOrUnit);
} else {
errs() << "- regunit: " << printRegUnit(VRegOrUnit, TRI) << '\n';
@@ -797,7 +797,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
regsLive.clear();
if (MRI->tracksLiveness()) {
for (const auto &LI : MBB->liveins()) {
- if (!TargetRegisterInfo::isPhysicalRegister(LI.PhysReg)) {
+ if (!Register::isPhysicalRegister(LI.PhysReg)) {
report("MBB live-in list contains non-physical register", MBB);
continue;
}
@@ -957,7 +957,7 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
// Generic opcodes must not have physical register operands.
for (unsigned I = 0; I < MI->getNumOperands(); ++I) {
const MachineOperand *MO = &MI->getOperand(I);
- if (MO->isReg() && TargetRegisterInfo::isPhysicalRegister(MO->getReg()))
+ if (MO->isReg() && Register::isPhysicalRegister(MO->getReg()))
report("Generic instruction cannot have physical register", MO, I);
}
@@ -1368,7 +1368,108 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
break;
}
}
+ switch (IntrID) {
+ case Intrinsic::memcpy:
+ if (MI->getNumOperands() != 5)
+ report("Expected memcpy intrinsic to have 5 operands", MI);
+ break;
+ case Intrinsic::memmove:
+ if (MI->getNumOperands() != 5)
+ report("Expected memmove intrinsic to have 5 operands", MI);
+ break;
+ case Intrinsic::memset:
+ if (MI->getNumOperands() != 5)
+ report("Expected memset intrinsic to have 5 operands", MI);
+ break;
+ }
+ break;
+ }
+ case TargetOpcode::G_SEXT_INREG: {
+ if (!MI->getOperand(2).isImm()) {
+ report("G_SEXT_INREG expects an immediate operand #2", MI);
+ break;
+ }
+
+ LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+ LLT SrcTy = MRI->getType(MI->getOperand(1).getReg());
+ verifyVectorElementMatch(DstTy, SrcTy, MI);
+
+ int64_t Imm = MI->getOperand(2).getImm();
+ if (Imm <= 0)
+ report("G_SEXT_INREG size must be >= 1", MI);
+ if (Imm >= SrcTy.getScalarSizeInBits())
+ report("G_SEXT_INREG size must be less than source bit width", MI);
+ break;
+ }
+ case TargetOpcode::G_SHUFFLE_VECTOR: {
+ const MachineOperand &MaskOp = MI->getOperand(3);
+ if (!MaskOp.isShuffleMask()) {
+ report("Incorrect mask operand type for G_SHUFFLE_VECTOR", MI);
+ break;
+ }
+
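+ // For example, a valid G_SHUFFLE_VECTOR producing <4 x s32> from two
+ // <2 x s32> sources takes a mask of four i32 indices in [0, 3], or -1
+ // for an undef lane.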
+ const Constant *Mask = MaskOp.getShuffleMask();
+ auto *MaskVT = dyn_cast<VectorType>(Mask->getType());
+ if (!MaskVT || !MaskVT->getElementType()->isIntegerTy(32)) {
+ report("Invalid shufflemask constant type", MI);
+ break;
+ }
+
+ if (!Mask->getAggregateElement(0u)) {
+ report("Invalid shufflemask constant type", MI);
+ break;
+ }
+ LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+ LLT Src0Ty = MRI->getType(MI->getOperand(1).getReg());
+ LLT Src1Ty = MRI->getType(MI->getOperand(2).getReg());
+
+ if (Src0Ty != Src1Ty)
+ report("Source operands must be the same type", MI);
+
+ if (Src0Ty.getScalarType() != DstTy.getScalarType())
+ report("G_SHUFFLE_VECTOR cannot change element type", MI);
+
+ // Don't check that all operands are vectors because scalars are used in
+ // place of 1-element vectors.
+ int SrcNumElts = Src0Ty.isVector() ? Src0Ty.getNumElements() : 1;
+ int DstNumElts = DstTy.isVector() ? DstTy.getNumElements() : 1;
+
+ SmallVector<int, 32> MaskIdxes;
+ ShuffleVectorInst::getShuffleMask(Mask, MaskIdxes);
+
+ if (static_cast<int>(MaskIdxes.size()) != DstNumElts)
+ report("Wrong result type for shufflemask", MI);
+
+ for (int Idx : MaskIdxes) {
+ if (Idx < 0)
+ continue;
+
+ if (Idx >= 2 * SrcNumElts)
+ report("Out of bounds shuffle index", MI);
+ }
+
+ break;
+ }
+ case TargetOpcode::G_DYN_STACKALLOC: {
+ const MachineOperand &DstOp = MI->getOperand(0);
+ const MachineOperand &AllocOp = MI->getOperand(1);
+ const MachineOperand &AlignOp = MI->getOperand(2);
+
+ if (!DstOp.isReg() || !MRI->getType(DstOp.getReg()).isPointer()) {
+ report("dst operand 0 must be a pointer type", MI);
+ break;
+ }
+
+ if (!AllocOp.isReg() || !MRI->getType(AllocOp.getReg()).isScalar()) {
+ report("src operand 1 must be a scalar reg type", MI);
+ break;
+ }
+
+ if (!AlignOp.isImm()) {
+ report("src operand 2 must be an immediate type", MI);
+ break;
+ }
break;
}
default:
@@ -1525,11 +1626,11 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
report("Operand should be tied", MO, MONum);
else if (unsigned(TiedTo) != MI->findTiedOperandIdx(MONum))
report("Tied def doesn't match MCInstrDesc", MO, MONum);
- else if (TargetRegisterInfo::isPhysicalRegister(MO->getReg())) {
+ else if (Register::isPhysicalRegister(MO->getReg())) {
const MachineOperand &MOTied = MI->getOperand(TiedTo);
if (!MOTied.isReg())
report("Tied counterpart must be a register", &MOTied, TiedTo);
- else if (TargetRegisterInfo::isPhysicalRegister(MOTied.getReg()) &&
+ else if (Register::isPhysicalRegister(MOTied.getReg()) &&
MO->getReg() != MOTied.getReg())
report("Tied physical registers must match.", &MOTied, TiedTo);
}
@@ -1543,7 +1644,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
switch (MO->getType()) {
case MachineOperand::MO_Register: {
- const unsigned Reg = MO->getReg();
+ const Register Reg = MO->getReg();
if (!Reg)
return;
if (MRI->tracksLiveness() && !MI->isDebugValue())
@@ -1581,7 +1682,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
// Check register classes.
unsigned SubIdx = MO->getSubReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (Register::isPhysicalRegister(Reg)) {
if (SubIdx) {
report("Illegal subregister index for physical register", MO, MONum);
return;
@@ -1817,7 +1918,7 @@ void MachineVerifier::checkLivenessAtDef(const MachineOperand *MO,
if (MO->isDead()) {
LiveQueryResult LRQ = LR.Query(DefIdx);
if (!LRQ.isDeadDef()) {
- assert(TargetRegisterInfo::isVirtualRegister(VRegOrUnit) &&
+ assert(Register::isVirtualRegister(VRegOrUnit) &&
"Expecting a virtual register.");
// A dead subreg def only tells us that the specific subreg is dead. There
// could be other non-dead defs of other subregs, or we could have other
@@ -1845,8 +1946,7 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
addRegWithSubRegs(regsKilled, Reg);
// Check that LiveVars knows this kill.
- if (LiveVars && TargetRegisterInfo::isVirtualRegister(Reg) &&
- MO->isKill()) {
+ if (LiveVars && Register::isVirtualRegister(Reg) && MO->isKill()) {
LiveVariables::VarInfo &VI = LiveVars->getVarInfo(Reg);
if (!is_contained(VI.Kills, MI))
report("Kill missing from LiveVariables", MO, MONum);
@@ -1856,7 +1956,7 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
if (LiveInts && !LiveInts->isNotInMIMap(*MI)) {
SlotIndex UseIdx = LiveInts->getInstructionIndex(*MI);
// Check the cached regunit intervals.
- if (TargetRegisterInfo::isPhysicalRegister(Reg) && !isReserved(Reg)) {
+ if (Register::isPhysicalRegister(Reg) && !isReserved(Reg)) {
for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
if (MRI->isReservedRegUnit(*Units))
continue;
@@ -1865,7 +1965,7 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
}
}
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
if (LiveInts->hasInterval(Reg)) {
// This is a virtual register interval.
const LiveInterval &LI = LiveInts->getInterval(Reg);
@@ -1900,7 +2000,7 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
// Use of a dead register.
if (!regsLive.count(Reg)) {
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (Register::isPhysicalRegister(Reg)) {
// Reserved registers may be used even when 'dead'.
bool Bad = !isReserved(Reg);
// We are fine if just any subregister has a defined value.
@@ -1922,7 +2022,7 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
if (!MOP.isReg() || !MOP.isImplicit())
continue;
- if (!TargetRegisterInfo::isPhysicalRegister(MOP.getReg()))
+ if (!Register::isPhysicalRegister(MOP.getReg()))
continue;
for (MCSubRegIterator SubRegs(MOP.getReg(), TRI); SubRegs.isValid();
@@ -1960,7 +2060,7 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
addRegWithSubRegs(regsDefined, Reg);
// Verify SSA form.
- if (MRI->isSSA() && TargetRegisterInfo::isVirtualRegister(Reg) &&
+ if (MRI->isSSA() && Register::isVirtualRegister(Reg) &&
std::next(MRI->def_begin(Reg)) != MRI->def_end())
report("Multiple virtual register defs in SSA form", MO, MONum);
@@ -1969,7 +2069,7 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
SlotIndex DefIdx = LiveInts->getInstructionIndex(*MI);
DefIdx = DefIdx.getRegSlot(MO->isEarlyClobber());
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
if (LiveInts->hasInterval(Reg)) {
const LiveInterval &LI = LiveInts->getInterval(Reg);
checkLivenessAtDef(MO, MONum, DefIdx, LI, Reg);
@@ -2007,7 +2107,7 @@ void MachineVerifier::visitMachineBundleAfter(const MachineInstr *MI) {
while (!regMasks.empty()) {
const uint32_t *Mask = regMasks.pop_back_val();
for (RegSet::iterator I = regsLive.begin(), E = regsLive.end(); I != E; ++I)
- if (TargetRegisterInfo::isPhysicalRegister(*I) &&
+ if (Register::isPhysicalRegister(*I) &&
MachineOperand::clobbersPhysReg(Mask, *I))
regsDead.push_back(*I);
}
@@ -2119,8 +2219,8 @@ void MachineVerifier::checkPHIOps(const MachineBasicBlock &MBB) {
if (MODef.isTied() || MODef.isImplicit() || MODef.isInternalRead() ||
MODef.isEarlyClobber() || MODef.isDebug())
report("Unexpected flag on PHI operand", &MODef, 0);
- unsigned DefReg = MODef.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(DefReg))
+ Register DefReg = MODef.getReg();
+ if (!Register::isVirtualRegister(DefReg))
report("Expected first PHI operand to be a virtual register", &MODef, 0);
for (unsigned I = 1, E = Phi.getNumOperands(); I != E; I += 2) {
@@ -2212,7 +2312,7 @@ void MachineVerifier::visitMachineFunctionAfter() {
void MachineVerifier::verifyLiveVariables() {
assert(LiveVars && "Don't call verifyLiveVariables without LiveVars");
for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ unsigned Reg = Register::index2VirtReg(i);
LiveVariables::VarInfo &VI = LiveVars->getVarInfo(Reg);
for (const auto &MBB : *MF) {
BBInfo &MInfo = MBBInfoMap[&MBB];
@@ -2238,7 +2338,7 @@ void MachineVerifier::verifyLiveVariables() {
void MachineVerifier::verifyLiveIntervals() {
assert(LiveInts && "Don't call verifyLiveIntervals without LiveInts");
for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ unsigned Reg = Register::index2VirtReg(i);
// Spilling and splitting may leave unused registers around. Skip them.
if (MRI->reg_nodbg_empty(Reg))
@@ -2315,11 +2415,11 @@ void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR,
for (ConstMIBundleOperands MOI(*MI); MOI.isValid(); ++MOI) {
if (!MOI->isReg() || !MOI->isDef())
continue;
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
if (MOI->getReg() != Reg)
continue;
} else {
- if (!TargetRegisterInfo::isPhysicalRegister(MOI->getReg()) ||
+ if (!Register::isPhysicalRegister(MOI->getReg()) ||
!TRI->hasRegUnit(MOI->getReg(), Reg))
continue;
}
@@ -2402,7 +2502,7 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR,
return;
// RegUnit intervals are allowed dead phis.
- if (!TargetRegisterInfo::isVirtualRegister(Reg) && VNI->isPHIDef() &&
+ if (!Register::isVirtualRegister(Reg) && VNI->isPHIDef() &&
S.start == VNI->def && S.end == VNI->def.getDeadSlot())
return;
@@ -2446,7 +2546,7 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR,
// The following checks only apply to virtual registers. Physreg liveness
// is too weird to check.
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
// A live segment can end with either a redefinition, a kill flag on a
// use, or a dead flag on a def.
bool hasRead = false;
@@ -2519,8 +2619,7 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR,
while (true) {
assert(LiveInts->isLiveInToMBB(LR, &*MFI));
// We don't know how to track physregs into a landing pad.
- if (!TargetRegisterInfo::isVirtualRegister(Reg) &&
- MFI->isEHPad()) {
+ if (!Register::isVirtualRegister(Reg) && MFI->isEHPad()) {
if (&*MFI == EndMBB)
break;
++MFI;
@@ -2580,7 +2679,7 @@ void MachineVerifier::verifyLiveRange(const LiveRange &LR, unsigned Reg,
void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) {
unsigned Reg = LI.reg;
- assert(TargetRegisterInfo::isVirtualRegister(Reg));
+ assert(Register::isVirtualRegister(Reg));
verifyLiveRange(LI, Reg);
LaneBitmask Mask;
diff --git a/lib/CodeGen/MacroFusion.cpp b/lib/CodeGen/MacroFusion.cpp
index 2db1e86905a4..d21eae222af0 100644
--- a/lib/CodeGen/MacroFusion.cpp
+++ b/lib/CodeGen/MacroFusion.cpp
@@ -176,7 +176,7 @@ std::unique_ptr<ScheduleDAGMutation>
llvm::createMacroFusionDAGMutation(
ShouldSchedulePredTy shouldScheduleAdjacent) {
if(EnableMacroFusion)
- return llvm::make_unique<MacroFusion>(shouldScheduleAdjacent, true);
+ return std::make_unique<MacroFusion>(shouldScheduleAdjacent, true);
return nullptr;
}
@@ -184,6 +184,6 @@ std::unique_ptr<ScheduleDAGMutation>
llvm::createBranchMacroFusionDAGMutation(
ShouldSchedulePredTy shouldScheduleAdjacent) {
if(EnableMacroFusion)
- return llvm::make_unique<MacroFusion>(shouldScheduleAdjacent, false);
+ return std::make_unique<MacroFusion>(shouldScheduleAdjacent, false);
return nullptr;
}
diff --git a/lib/CodeGen/ModuloSchedule.cpp b/lib/CodeGen/ModuloSchedule.cpp
new file mode 100644
index 000000000000..7ce3c5861801
--- /dev/null
+++ b/lib/CodeGen/ModuloSchedule.cpp
@@ -0,0 +1,2022 @@
+//===- ModuloSchedule.cpp - Software pipeline schedule expansion ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ModuloSchedule.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopUtils.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "pipeliner"
+using namespace llvm;
+
+void ModuloSchedule::print(raw_ostream &OS) {
+ for (MachineInstr *MI : ScheduledInstrs)
+ OS << "[stage " << getStage(MI) << " @" << getCycle(MI) << "c] " << *MI;
+}
+
+//===----------------------------------------------------------------------===//
+// ModuloScheduleExpander implementation
+//===----------------------------------------------------------------------===//
+
+/// Return the register values for the operands of a Phi instruction.
+/// This function assumes the instruction is a Phi.
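+/// For example, given v1 = phi(a [preheader], v2 [loop]), InitVal is set
+/// to a and LoopVal to v2.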
+static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
+ unsigned &InitVal, unsigned &LoopVal) {
+ assert(Phi.isPHI() && "Expecting a Phi.");
+
+ InitVal = 0;
+ LoopVal = 0;
+ for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
+ if (Phi.getOperand(i + 1).getMBB() != Loop)
+ InitVal = Phi.getOperand(i).getReg();
+ else
+ LoopVal = Phi.getOperand(i).getReg();
+
+ assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
+}
+
+/// Return the Phi register value that comes from the incoming block.
+static unsigned getInitPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
+ for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
+ if (Phi.getOperand(i + 1).getMBB() != LoopBB)
+ return Phi.getOperand(i).getReg();
+ return 0;
+}
+
+/// Return the Phi register value that comes from the loop block.
+static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) {
+ for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
+ if (Phi.getOperand(i + 1).getMBB() == LoopBB)
+ return Phi.getOperand(i).getReg();
+ return 0;
+}
+
+void ModuloScheduleExpander::expand() {
+ BB = Schedule.getLoop()->getTopBlock();
+ Preheader = *BB->pred_begin();
+ if (Preheader == BB)
+ Preheader = *std::next(BB->pred_begin());
+
+ // Iterate over the definitions in each instruction, and compute the
+ // stage difference for each use. Keep the maximum value.
+ for (MachineInstr *MI : Schedule.getInstructions()) {
+ int DefStage = Schedule.getStage(MI);
+ for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (!Op.isReg() || !Op.isDef())
+ continue;
+
+ Register Reg = Op.getReg();
+ unsigned MaxDiff = 0;
+ bool PhiIsSwapped = false;
+ for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(Reg),
+ EI = MRI.use_end();
+ UI != EI; ++UI) {
+ MachineOperand &UseOp = *UI;
+ MachineInstr *UseMI = UseOp.getParent();
+ int UseStage = Schedule.getStage(UseMI);
+ unsigned Diff = 0;
+ if (UseStage != -1 && UseStage >= DefStage)
+ Diff = UseStage - DefStage;
+ if (MI->isPHI()) {
+ if (isLoopCarried(*MI))
+ ++Diff;
+ else
+ PhiIsSwapped = true;
+ }
+ MaxDiff = std::max(Diff, MaxDiff);
+ }
+ RegToStageDiff[Reg] = std::make_pair(MaxDiff, PhiIsSwapped);
+ }
+ }
+
+ generatePipelinedLoop();
+}
+
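+// Shape of the generated code, for a schedule with S stages: the prolog
+// blocks ramp up the pipeline, with block i issuing the stage-0 work of
+// iteration i and advancing the iterations already in flight by one stage;
+// the kernel block is the steady state, executing one stage of S
+// consecutive iterations per pass; and the epilog blocks drain the
+// pipeline, finishing the iterations still in flight when the kernel
+// exits.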
+void ModuloScheduleExpander::generatePipelinedLoop() {
+ LoopInfo = TII->analyzeLoopForPipelining(BB);
+ assert(LoopInfo && "Must be able to analyze loop!");
+
+ // Create a new basic block for the kernel and add it to the CFG.
+ MachineBasicBlock *KernelBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
+
+ unsigned MaxStageCount = Schedule.getNumStages() - 1;
+
+ // Remember the registers that are used in different stages. The index is
+ // the iteration, or stage, that the instruction is scheduled in. This is
+ // a map between register names in the original block and the names created
+ // in each stage of the pipelined loop.
+ ValueMapTy *VRMap = new ValueMapTy[(MaxStageCount + 1) * 2];
+ InstrMapTy InstrMap;
+
+ SmallVector<MachineBasicBlock *, 4> PrologBBs;
+
+ // Generate the prolog instructions that set up the pipeline.
+ generateProlog(MaxStageCount, KernelBB, VRMap, PrologBBs);
+ MF.insert(BB->getIterator(), KernelBB);
+
+ // Rearrange the instructions to generate the new, pipelined loop,
+ // and update register names as needed.
+ for (MachineInstr *CI : Schedule.getInstructions()) {
+ if (CI->isPHI())
+ continue;
+ unsigned StageNum = Schedule.getStage(CI);
+ MachineInstr *NewMI = cloneInstr(CI, MaxStageCount, StageNum);
+ updateInstruction(NewMI, false, MaxStageCount, StageNum, VRMap);
+ KernelBB->push_back(NewMI);
+ InstrMap[NewMI] = CI;
+ }
+
+ // Copy any terminator instructions to the new kernel, and update
+ // names as needed.
+ for (MachineBasicBlock::iterator I = BB->getFirstTerminator(),
+ E = BB->instr_end();
+ I != E; ++I) {
+ MachineInstr *NewMI = MF.CloneMachineInstr(&*I);
+ updateInstruction(NewMI, false, MaxStageCount, 0, VRMap);
+ KernelBB->push_back(NewMI);
+ InstrMap[NewMI] = &*I;
+ }
+
+ NewKernel = KernelBB;
+ KernelBB->transferSuccessors(BB);
+ KernelBB->replaceSuccessor(BB, KernelBB);
+
+ generateExistingPhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, VRMap,
+ InstrMap, MaxStageCount, MaxStageCount, false);
+ generatePhis(KernelBB, PrologBBs.back(), KernelBB, KernelBB, VRMap, InstrMap,
+ MaxStageCount, MaxStageCount, false);
+
+ LLVM_DEBUG(dbgs() << "New block\n"; KernelBB->dump(););
+
+ SmallVector<MachineBasicBlock *, 4> EpilogBBs;
+ // Generate the epilog instructions to complete the pipeline.
+ generateEpilog(MaxStageCount, KernelBB, VRMap, EpilogBBs, PrologBBs);
+
+ // We need this step because the register allocation doesn't handle some
+ // situations well, so we insert copies to help out.
+ splitLifetimes(KernelBB, EpilogBBs);
+
+ // Remove dead instructions due to loop induction variables.
+ removeDeadInstructions(KernelBB, EpilogBBs);
+
+ // Add branches between prolog and epilog blocks.
+ addBranches(*Preheader, PrologBBs, KernelBB, EpilogBBs, VRMap);
+
+ delete[] VRMap;
+}
+
+void ModuloScheduleExpander::cleanup() {
+ // Remove the original loop since it's no longer referenced.
+ for (auto &I : *BB)
+ LIS.RemoveMachineInstrFromMaps(I);
+ BB->clear();
+ BB->eraseFromParent();
+}
+
+/// Generate the pipeline prolog code.
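+/// For example, with three stages the prolog has two blocks: the first
+/// runs stage 0 of iteration 0; the second runs stage 1 of iteration 0
+/// and stage 0 of iteration 1.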
+void ModuloScheduleExpander::generateProlog(unsigned LastStage,
+ MachineBasicBlock *KernelBB,
+ ValueMapTy *VRMap,
+ MBBVectorTy &PrologBBs) {
+ MachineBasicBlock *PredBB = Preheader;
+ InstrMapTy InstrMap;
+
+ // Generate a basic block for each stage, not including the last stage,
+ // which will be generated in the kernel. Each basic block may contain
+ // instructions from multiple stages/iterations.
+ for (unsigned i = 0; i < LastStage; ++i) {
+ // Create and insert the prolog basic block prior to the original loop
+ // basic block. The original loop is removed later.
+ MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
+ PrologBBs.push_back(NewBB);
+ MF.insert(BB->getIterator(), NewBB);
+ NewBB->transferSuccessors(PredBB);
+ PredBB->addSuccessor(NewBB);
+ PredBB = NewBB;
+
+ // Generate instructions for each appropriate stage. Process instructions
+ // in original program order.
+ for (int StageNum = i; StageNum >= 0; --StageNum) {
+ for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
+ BBE = BB->getFirstTerminator();
+ BBI != BBE; ++BBI) {
+ if (Schedule.getStage(&*BBI) == StageNum) {
+ if (BBI->isPHI())
+ continue;
+ MachineInstr *NewMI =
+ cloneAndChangeInstr(&*BBI, i, (unsigned)StageNum);
+ updateInstruction(NewMI, false, i, (unsigned)StageNum, VRMap);
+ NewBB->push_back(NewMI);
+ InstrMap[NewMI] = &*BBI;
+ }
+ }
+ }
+ rewritePhiValues(NewBB, i, VRMap, InstrMap);
+ LLVM_DEBUG({
+ dbgs() << "prolog:\n";
+ NewBB->dump();
+ });
+ }
+
+ PredBB->replaceSuccessor(BB, KernelBB);
+
+ // Check if we need to remove the branch from the preheader to the original
+ // loop, and replace it with a branch to the new loop.
+ unsigned numBranches = TII->removeBranch(*Preheader);
+ if (numBranches) {
+ SmallVector<MachineOperand, 0> Cond;
+ TII->insertBranch(*Preheader, PrologBBs[0], nullptr, Cond, DebugLoc());
+ }
+}
+
+/// Generate the pipeline epilog code. The epilog code finishes the iterations
+/// that were started in either the prolog or the kernel. We create a basic
+/// block for each stage that needs to complete.
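+/// For example, with three stages the first epilog block runs stage 2 for
+/// the iteration that has completed stages 0 and 1, and the second runs
+/// stages 1 and 2 for the iteration that has only completed stage 0.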
+void ModuloScheduleExpander::generateEpilog(unsigned LastStage,
+ MachineBasicBlock *KernelBB,
+ ValueMapTy *VRMap,
+ MBBVectorTy &EpilogBBs,
+ MBBVectorTy &PrologBBs) {
+ // We need to change the branch from the kernel to the first epilog block, so
+ // this call to analyze branch uses the kernel rather than the original BB.
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ bool checkBranch = TII->analyzeBranch(*KernelBB, TBB, FBB, Cond);
+ assert(!checkBranch && "generateEpilog must be able to analyze the branch");
+ if (checkBranch)
+ return;
+
+ MachineBasicBlock::succ_iterator LoopExitI = KernelBB->succ_begin();
+ if (*LoopExitI == KernelBB)
+ ++LoopExitI;
+ assert(LoopExitI != KernelBB->succ_end() && "Expecting a successor");
+ MachineBasicBlock *LoopExitBB = *LoopExitI;
+
+ MachineBasicBlock *PredBB = KernelBB;
+ MachineBasicBlock *EpilogStart = LoopExitBB;
+ InstrMapTy InstrMap;
+
+ // Generate a basic block for each stage, not including the last stage,
+ // which was generated for the kernel. Each basic block may contain
+ // instructions from multiple stages/iterations.
+ int EpilogStage = LastStage + 1;
+ for (unsigned i = LastStage; i >= 1; --i, ++EpilogStage) {
+ MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock();
+ EpilogBBs.push_back(NewBB);
+ MF.insert(BB->getIterator(), NewBB);
+
+ PredBB->replaceSuccessor(LoopExitBB, NewBB);
+ NewBB->addSuccessor(LoopExitBB);
+
+ if (EpilogStart == LoopExitBB)
+ EpilogStart = NewBB;
+
+ // Add instructions to the epilog depending on the current block.
+ // Process instructions in original program order.
+ for (unsigned StageNum = i; StageNum <= LastStage; ++StageNum) {
+ for (auto &BBI : *BB) {
+ if (BBI.isPHI())
+ continue;
+ MachineInstr *In = &BBI;
+ if ((unsigned)Schedule.getStage(In) == StageNum) {
+ // Instructions with memoperands in the epilog are updated with
+ // conservative values.
+ MachineInstr *NewMI = cloneInstr(In, UINT_MAX, 0);
+ updateInstruction(NewMI, i == 1, EpilogStage, 0, VRMap);
+ NewBB->push_back(NewMI);
+ InstrMap[NewMI] = In;
+ }
+ }
+ }
+ generateExistingPhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, VRMap,
+ InstrMap, LastStage, EpilogStage, i == 1);
+ generatePhis(NewBB, PrologBBs[i - 1], PredBB, KernelBB, VRMap, InstrMap,
+ LastStage, EpilogStage, i == 1);
+ PredBB = NewBB;
+
+ LLVM_DEBUG({
+ dbgs() << "epilog:\n";
+ NewBB->dump();
+ });
+ }
+
+ // Fix any Phi nodes in the loop exit block.
+ LoopExitBB->replacePhiUsesWith(BB, PredBB);
+
+ // Create a branch to the new epilog from the kernel.
+ // Remove the original branch and add a new branch to the epilog.
+ TII->removeBranch(*KernelBB);
+ TII->insertBranch(*KernelBB, KernelBB, EpilogStart, Cond, DebugLoc());
+ // Add a branch to the loop exit.
+ if (EpilogBBs.size() > 0) {
+ MachineBasicBlock *LastEpilogBB = EpilogBBs.back();
+ SmallVector<MachineOperand, 4> Cond1;
+ TII->insertBranch(*LastEpilogBB, LoopExitBB, nullptr, Cond1, DebugLoc());
+ }
+}
+
+/// Replace all uses of FromReg that appear outside the specified
+/// basic block with ToReg.
+static void replaceRegUsesAfterLoop(unsigned FromReg, unsigned ToReg,
+ MachineBasicBlock *MBB,
+ MachineRegisterInfo &MRI,
+ LiveIntervals &LIS) {
+ for (MachineRegisterInfo::use_iterator I = MRI.use_begin(FromReg),
+ E = MRI.use_end();
+ I != E;) {
+ MachineOperand &O = *I;
+ ++I;
+ if (O.getParent()->getParent() != MBB)
+ O.setReg(ToReg);
+ }
+ if (!LIS.hasInterval(ToReg))
+ LIS.createEmptyInterval(ToReg);
+}
+
+/// Return true if the register has a use that occurs outside the
+/// specified loop.
+static bool hasUseAfterLoop(unsigned Reg, MachineBasicBlock *BB,
+ MachineRegisterInfo &MRI) {
+ for (MachineRegisterInfo::use_iterator I = MRI.use_begin(Reg),
+ E = MRI.use_end();
+ I != E; ++I)
+ if (I->getParent()->getParent() != BB)
+ return true;
+ return false;
+}
+
+/// Generate Phis for the specified block in the generated pipelined code.
+/// This function looks at the Phis from the original code to guide the
+/// creation of new Phis.
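+/// For example, a loop-carried value
+///   v1 = phi(a [prolog], v2 [kernel])
+/// may need one renamed Phi per stage in which v1 is live; the initial
+/// operand of each copy comes from the matching prolog stage (or a), and
+/// the loop operand from the kernel or the previous epilog block.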
+void ModuloScheduleExpander::generateExistingPhis(
+ MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
+ MachineBasicBlock *KernelBB, ValueMapTy *VRMap, InstrMapTy &InstrMap,
+ unsigned LastStageNum, unsigned CurStageNum, bool IsLast) {
+ // Compute the stage number for the initial value of the Phi, which
+ // comes from the prolog. The prolog to use depends on which kernel or
+ // epilog block we are adding the Phi to.
+ unsigned PrologStage = 0;
+ unsigned PrevStage = 0;
+ bool InKernel = (LastStageNum == CurStageNum);
+ if (InKernel) {
+ PrologStage = LastStageNum - 1;
+ PrevStage = CurStageNum;
+ } else {
+ PrologStage = LastStageNum - (CurStageNum - LastStageNum);
+ PrevStage = LastStageNum + (CurStageNum - LastStageNum) - 1;
+ }
+
+ for (MachineBasicBlock::iterator BBI = BB->instr_begin(),
+ BBE = BB->getFirstNonPHI();
+ BBI != BBE; ++BBI) {
+ Register Def = BBI->getOperand(0).getReg();
+
+ unsigned InitVal = 0;
+ unsigned LoopVal = 0;
+ getPhiRegs(*BBI, BB, InitVal, LoopVal);
+
+ unsigned PhiOp1 = 0;
+ // The Phi value from the loop body typically is defined in the loop, but
+ // not always, so we need to check whether the value is defined in the loop.
+ unsigned PhiOp2 = LoopVal;
+ if (VRMap[LastStageNum].count(LoopVal))
+ PhiOp2 = VRMap[LastStageNum][LoopVal];
+
+ int StageScheduled = Schedule.getStage(&*BBI);
+ int LoopValStage = Schedule.getStage(MRI.getVRegDef(LoopVal));
+ unsigned NumStages = getStagesForReg(Def, CurStageNum);
+ if (NumStages == 0) {
+ // We don't need to generate a Phi anymore, but we need to rename any uses
+ // of the Phi value.
+ unsigned NewReg = VRMap[PrevStage][LoopVal];
+ rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, 0, &*BBI, Def,
+ InitVal, NewReg);
+ if (VRMap[CurStageNum].count(LoopVal))
+ VRMap[CurStageNum][Def] = VRMap[CurStageNum][LoopVal];
+ }
+ // Adjust the number of Phis needed depending on the number of prologs left,
+ // and the distance from where the Phi is first scheduled. The number of
+ // Phis cannot exceed the number of prolog stages. Each stage can
+ // potentially define two values.
+ unsigned MaxPhis = PrologStage + 2;
+ if (!InKernel && (int)PrologStage <= LoopValStage)
+ MaxPhis = std::max((int)MaxPhis - (int)LoopValStage, 1);
+ unsigned NumPhis = std::min(NumStages, MaxPhis);
+
+ unsigned NewReg = 0;
+ unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled;
+ // In the epilog, we may need to look back one stage to get the correct
+ // Phi name because the epilog and prolog blocks execute the same stage.
+ // The correct name is from the previous block only when the Phi has
+ // been completely scheduled prior to the epilog, and the Phi value is not
+ // needed in multiple stages.
+ int StageDiff = 0;
+ if (!InKernel && StageScheduled >= LoopValStage && AccessStage == 0 &&
+ NumPhis == 1)
+ StageDiff = 1;
+ // Adjust the computations below when the phi and the loop definition
+ // are scheduled in different stages.
+ if (InKernel && LoopValStage != -1 && StageScheduled > LoopValStage)
+ StageDiff = StageScheduled - LoopValStage;
+ for (unsigned np = 0; np < NumPhis; ++np) {
+ // If the Phi hasn't been scheduled, then use the initial Phi operand
+ // value. Otherwise, use the scheduled version of the instruction. This
+ // is a little complicated when a Phi references another Phi.
+ if (np > PrologStage || StageScheduled >= (int)LastStageNum)
+ PhiOp1 = InitVal;
+ // Check if the Phi has already been scheduled in a prolog stage.
+ else if (PrologStage >= AccessStage + StageDiff + np &&
+ VRMap[PrologStage - StageDiff - np].count(LoopVal) != 0)
+ PhiOp1 = VRMap[PrologStage - StageDiff - np][LoopVal];
+ // Check if the Phi has already been scheduled, but the loop instruction
+ // is either another Phi, or doesn't occur in the loop.
+ else if (PrologStage >= AccessStage + StageDiff + np) {
+ // If the Phi references another Phi, we need to examine the other
+ // Phi to get the correct value.
+ PhiOp1 = LoopVal;
+ MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1);
+ int Indirects = 1;
+ while (InstOp1 && InstOp1->isPHI() && InstOp1->getParent() == BB) {
+ int PhiStage = Schedule.getStage(InstOp1);
+ if ((int)(PrologStage - StageDiff - np) < PhiStage + Indirects)
+ PhiOp1 = getInitPhiReg(*InstOp1, BB);
+ else
+ PhiOp1 = getLoopPhiReg(*InstOp1, BB);
+ InstOp1 = MRI.getVRegDef(PhiOp1);
+ int PhiOpStage = Schedule.getStage(InstOp1);
+ int StageAdj = (PhiOpStage != -1 ? PhiStage - PhiOpStage : 0);
+ if (PhiOpStage != -1 && PrologStage - StageAdj >= Indirects + np &&
+ VRMap[PrologStage - StageAdj - Indirects - np].count(PhiOp1)) {
+ PhiOp1 = VRMap[PrologStage - StageAdj - Indirects - np][PhiOp1];
+ break;
+ }
+ ++Indirects;
+ }
+ } else
+ PhiOp1 = InitVal;
+ // If this references a generated Phi in the kernel, get the Phi operand
+ // from the incoming block.
+ if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1))
+ if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
+ PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
+
+ MachineInstr *PhiInst = MRI.getVRegDef(LoopVal);
+ bool LoopDefIsPhi = PhiInst && PhiInst->isPHI();
+ // In the epilog, a map lookup is needed to get the value from the kernel,
+ // or previous epilog block. How this is done depends on whether the
+ // instruction is scheduled in the previous block.
+ if (!InKernel) {
+ int StageDiffAdj = 0;
+ if (LoopValStage != -1 && StageScheduled > LoopValStage)
+ StageDiffAdj = StageScheduled - LoopValStage;
+ // Use the loop value defined in the kernel, unless the kernel
+ // contains the last definition of the Phi.
+ if (np == 0 && PrevStage == LastStageNum &&
+ (StageScheduled != 0 || LoopValStage != 0) &&
+ VRMap[PrevStage - StageDiffAdj].count(LoopVal))
+ PhiOp2 = VRMap[PrevStage - StageDiffAdj][LoopVal];
+ // Use the value defined by the Phi. We add one because we switch
+ // from looking at the loop value to the Phi definition.
+ else if (np > 0 && PrevStage == LastStageNum &&
+ VRMap[PrevStage - np + 1].count(Def))
+ PhiOp2 = VRMap[PrevStage - np + 1][Def];
+ // Use the loop value defined in the kernel.
+ else if (static_cast<unsigned>(LoopValStage) > PrologStage + 1 &&
+ VRMap[PrevStage - StageDiffAdj - np].count(LoopVal))
+ PhiOp2 = VRMap[PrevStage - StageDiffAdj - np][LoopVal];
+ // Use the value defined by the Phi, unless we're generating the first
+ // epilog and the Phi refers to a Phi in a different stage.
+ else if (VRMap[PrevStage - np].count(Def) &&
+ (!LoopDefIsPhi || (PrevStage != LastStageNum) ||
+ (LoopValStage == StageScheduled)))
+ PhiOp2 = VRMap[PrevStage - np][Def];
+ }
+
+ // Check if we can reuse an existing Phi. This occurs when a Phi
+ // references another Phi, and the other Phi is scheduled in an
+ // earlier stage. We can try to reuse an existing Phi up until the last
+ // stage of the current Phi.
+ if (LoopDefIsPhi) {
+ if (static_cast<int>(PrologStage - np) >= StageScheduled) {
+ int LVNumStages = getStagesForPhi(LoopVal);
+ int StageDiff = (StageScheduled - LoopValStage);
+ LVNumStages -= StageDiff;
+ // Make sure the loop value Phi has been processed already.
+ if (LVNumStages > (int)np && VRMap[CurStageNum].count(LoopVal)) {
+ NewReg = PhiOp2;
+ unsigned ReuseStage = CurStageNum;
+ if (isLoopCarried(*PhiInst))
+ ReuseStage -= LVNumStages;
+ // Check if the Phi to reuse has been generated yet. If not, then
+ // there is nothing to reuse.
+ if (VRMap[ReuseStage - np].count(LoopVal)) {
+ NewReg = VRMap[ReuseStage - np][LoopVal];
+
+ rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI,
+ Def, NewReg);
+ // Update the map with the new Phi name.
+ VRMap[CurStageNum - np][Def] = NewReg;
+ PhiOp2 = NewReg;
+ if (VRMap[LastStageNum - np - 1].count(LoopVal))
+ PhiOp2 = VRMap[LastStageNum - np - 1][LoopVal];
+
+ if (IsLast && np == NumPhis - 1)
+ replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
+ continue;
+ }
+ }
+ }
+ if (InKernel && StageDiff > 0 &&
+ VRMap[CurStageNum - StageDiff - np].count(LoopVal))
+ PhiOp2 = VRMap[CurStageNum - StageDiff - np][LoopVal];
+ }
+
+ const TargetRegisterClass *RC = MRI.getRegClass(Def);
+ NewReg = MRI.createVirtualRegister(RC);
+
+ MachineInstrBuilder NewPhi =
+ BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
+ TII->get(TargetOpcode::PHI), NewReg);
+ NewPhi.addReg(PhiOp1).addMBB(BB1);
+ NewPhi.addReg(PhiOp2).addMBB(BB2);
+ if (np == 0)
+ InstrMap[NewPhi] = &*BBI;
+
+ // We define the Phis after creating the new pipelined code, so
+ // we need to rename the Phi values in scheduled instructions.
+
+ unsigned PrevReg = 0;
+ if (InKernel && VRMap[PrevStage - np].count(LoopVal))
+ PrevReg = VRMap[PrevStage - np][LoopVal];
+ rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, Def,
+ NewReg, PrevReg);
+ // If the Phi has been scheduled, use the new name for rewriting.
+ if (VRMap[CurStageNum - np].count(Def)) {
+ unsigned R = VRMap[CurStageNum - np][Def];
+ rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, R,
+ NewReg);
+ }
+
+ // Check if we need to rename any uses that occurs after the loop. The
+ // register to replace depends on whether the Phi is scheduled in the
+ // epilog.
+ if (IsLast && np == NumPhis - 1)
+ replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
+
+ // In the kernel, a dependent Phi uses the value from this Phi.
+ if (InKernel)
+ PhiOp2 = NewReg;
+
+ // Update the map with the new Phi name.
+ VRMap[CurStageNum - np][Def] = NewReg;
+ }
+
+ while (NumPhis++ < NumStages) {
+ rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, NumPhis, &*BBI, Def,
+ NewReg, 0);
+ }
+
+ // Check if we need to rename a Phi that has been eliminated due to
+ // scheduling.
+ if (NumStages == 0 && IsLast && VRMap[CurStageNum].count(LoopVal))
+ replaceRegUsesAfterLoop(Def, VRMap[CurStageNum][LoopVal], BB, MRI, LIS);
+ }
+}
+
+/// Generate Phis for the specified block in the generated pipelined code.
+/// These are new Phis needed because the definition is scheduled after the
+/// use in the pipelined sequence.
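+/// For example, if a value is defined in stage 1 but one of its uses is
+/// scheduled in stage 0, the use executes one iteration ahead of the
+/// definition, so a new Phi must carry the previous iteration's value
+/// into the kernel and epilog blocks.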
+void ModuloScheduleExpander::generatePhis(
+ MachineBasicBlock *NewBB, MachineBasicBlock *BB1, MachineBasicBlock *BB2,
+ MachineBasicBlock *KernelBB, ValueMapTy *VRMap, InstrMapTy &InstrMap,
+ unsigned LastStageNum, unsigned CurStageNum, bool IsLast) {
+ // Compute the stage number that contains the initial Phi value, and
+ // the Phi from the previous stage.
+ unsigned PrologStage = 0;
+ unsigned PrevStage = 0;
+ unsigned StageDiff = CurStageNum - LastStageNum;
+ bool InKernel = (StageDiff == 0);
+ if (InKernel) {
+ PrologStage = LastStageNum - 1;
+ PrevStage = CurStageNum;
+ } else {
+ PrologStage = LastStageNum - StageDiff;
+ PrevStage = LastStageNum + StageDiff - 1;
+ }
+
+ for (MachineBasicBlock::iterator BBI = BB->getFirstNonPHI(),
+ BBE = BB->instr_end();
+ BBI != BBE; ++BBI) {
+ for (unsigned i = 0, e = BBI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = BBI->getOperand(i);
+ if (!MO.isReg() || !MO.isDef() ||
+ !Register::isVirtualRegister(MO.getReg()))
+ continue;
+
+ int StageScheduled = Schedule.getStage(&*BBI);
+ assert(StageScheduled != -1 && "Expecting scheduled instruction.");
+ Register Def = MO.getReg();
+ unsigned NumPhis = getStagesForReg(Def, CurStageNum);
+ // An instruction scheduled in stage 0 that is used after the loop
+ // requires a phi in the epilog for the last definition from either
+ // the kernel or prolog.
+ if (!InKernel && NumPhis == 0 && StageScheduled == 0 &&
+ hasUseAfterLoop(Def, BB, MRI))
+ NumPhis = 1;
+ if (!InKernel && (unsigned)StageScheduled > PrologStage)
+ continue;
+
+ unsigned PhiOp2 = VRMap[PrevStage][Def];
+ if (MachineInstr *InstOp2 = MRI.getVRegDef(PhiOp2))
+ if (InstOp2->isPHI() && InstOp2->getParent() == NewBB)
+ PhiOp2 = getLoopPhiReg(*InstOp2, BB2);
+ // The number of Phis can't exceed the number of prolog stages. The
+ // prolog stage number is zero based.
+ if (NumPhis > PrologStage + 1 - StageScheduled)
+ NumPhis = PrologStage + 1 - StageScheduled;
+ for (unsigned np = 0; np < NumPhis; ++np) {
+ unsigned PhiOp1 = VRMap[PrologStage][Def];
+ if (np <= PrologStage)
+ PhiOp1 = VRMap[PrologStage - np][Def];
+ if (MachineInstr *InstOp1 = MRI.getVRegDef(PhiOp1)) {
+ if (InstOp1->isPHI() && InstOp1->getParent() == KernelBB)
+ PhiOp1 = getInitPhiReg(*InstOp1, KernelBB);
+ if (InstOp1->isPHI() && InstOp1->getParent() == NewBB)
+ PhiOp1 = getInitPhiReg(*InstOp1, NewBB);
+ }
+ if (!InKernel)
+ PhiOp2 = VRMap[PrevStage - np][Def];
+
+ const TargetRegisterClass *RC = MRI.getRegClass(Def);
+ Register NewReg = MRI.createVirtualRegister(RC);
+
+ MachineInstrBuilder NewPhi =
+ BuildMI(*NewBB, NewBB->getFirstNonPHI(), DebugLoc(),
+ TII->get(TargetOpcode::PHI), NewReg);
+ NewPhi.addReg(PhiOp1).addMBB(BB1);
+ NewPhi.addReg(PhiOp2).addMBB(BB2);
+ if (np == 0)
+ InstrMap[NewPhi] = &*BBI;
+
+ // Rewrite uses and update the map. The actions depend upon whether
+ // we are generating code for the kernel or epilog blocks.
+ if (InKernel) {
+ rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, PhiOp1,
+ NewReg);
+ rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, PhiOp2,
+ NewReg);
+
+ PhiOp2 = NewReg;
+ VRMap[PrevStage - np - 1][Def] = NewReg;
+ } else {
+ VRMap[CurStageNum - np][Def] = NewReg;
+ if (np == NumPhis - 1)
+ rewriteScheduledInstr(NewBB, InstrMap, CurStageNum, np, &*BBI, Def,
+ NewReg);
+ }
+ if (IsLast && np == NumPhis - 1)
+ replaceRegUsesAfterLoop(Def, NewReg, BB, MRI, LIS);
+ }
+ }
+ }
+}
+
+/// Remove instructions that generate values with no uses.
+/// Typically, these are induction variable operations that generate values
+/// used in the loop itself. A dead instruction has a definition with
+/// no uses, or uses that occur in the original loop only.
+void ModuloScheduleExpander::removeDeadInstructions(MachineBasicBlock *KernelBB,
+ MBBVectorTy &EpilogBBs) {
+ // For each epilog block, check that the value defined by each instruction
+ // is used. If not, delete it.
+ for (MBBVectorTy::reverse_iterator MBB = EpilogBBs.rbegin(),
+ MBE = EpilogBBs.rend();
+ MBB != MBE; ++MBB)
+ for (MachineBasicBlock::reverse_instr_iterator MI = (*MBB)->instr_rbegin(),
+ ME = (*MBB)->instr_rend();
+ MI != ME;) {
+ // From DeadMachineInstructionElim. Don't delete inline assembly.
+ if (MI->isInlineAsm()) {
+ ++MI;
+ continue;
+ }
+ bool SawStore = false;
+ // Check if it's safe to remove the instruction due to side effects.
+ // We can, and want to, remove Phis here.
+ if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI()) {
+ ++MI;
+ continue;
+ }
+ bool used = true;
+ for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
+ MOE = MI->operands_end();
+ MOI != MOE; ++MOI) {
+ if (!MOI->isReg() || !MOI->isDef())
+ continue;
+ Register reg = MOI->getReg();
+ // Assume physical registers are used, unless they are marked dead.
+ if (Register::isPhysicalRegister(reg)) {
+ used = !MOI->isDead();
+ if (used)
+ break;
+ continue;
+ }
+ unsigned realUses = 0;
+ for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(reg),
+ EI = MRI.use_end();
+ UI != EI; ++UI) {
+ // A use inside the original loop block is not a real use, since
+ // that block is removed later; only uses outside BB count.
+ if (UI->getParent()->getParent() != BB) {
+ realUses++;
+ used = true;
+ break;
+ }
+ }
+ if (realUses > 0)
+ break;
+ used = false;
+ }
+ if (!used) {
+ LIS.RemoveMachineInstrFromMaps(*MI);
+ MI++->eraseFromParent();
+ continue;
+ }
+ ++MI;
+ }
+ // In the kernel block, check if we can remove a Phi that generates a value
+ // used in an instruction removed in the epilog block.
+ for (MachineBasicBlock::iterator BBI = KernelBB->instr_begin(),
+ BBE = KernelBB->getFirstNonPHI();
+ BBI != BBE;) {
+ MachineInstr *MI = &*BBI;
+ ++BBI;
+ Register reg = MI->getOperand(0).getReg();
+ if (MRI.use_begin(reg) == MRI.use_end()) {
+ LIS.RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+ }
+ }
+}
+
+/// For loop carried definitions, we split the lifetime of a virtual register
+/// that has uses past the definition in the next iteration. A copy with a new
+/// virtual register is inserted before the definition, which helps with
+/// generating a better register assignment.
+///
+/// v1 = phi(a, v2) v1 = phi(a, v2)
+/// v2 = phi(b, v3) v2 = phi(b, v3)
+/// v3 = .. v4 = copy v1
+/// .. = V1 v3 = ..
+/// .. = v4
+void ModuloScheduleExpander::splitLifetimes(MachineBasicBlock *KernelBB,
+ MBBVectorTy &EpilogBBs) {
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ for (auto &PHI : KernelBB->phis()) {
+ Register Def = PHI.getOperand(0).getReg();
+ // Check for any Phi definition that is used as an operand of another Phi
+ // in the same block.
+ for (MachineRegisterInfo::use_instr_iterator I = MRI.use_instr_begin(Def),
+ E = MRI.use_instr_end();
+ I != E; ++I) {
+ if (I->isPHI() && I->getParent() == KernelBB) {
+ // Get the loop carried definition.
+ unsigned LCDef = getLoopPhiReg(PHI, KernelBB);
+ if (!LCDef)
+ continue;
+ MachineInstr *MI = MRI.getVRegDef(LCDef);
+ if (!MI || MI->getParent() != KernelBB || MI->isPHI())
+ continue;
+ // Search through the rest of the block looking for uses of the Phi
+ // definition. If one occurs, then split the lifetime.
+ unsigned SplitReg = 0;
+ for (auto &BBJ : make_range(MachineBasicBlock::instr_iterator(MI),
+ KernelBB->instr_end()))
+ if (BBJ.readsRegister(Def)) {
+ // We split the lifetime when we find the first use.
+ if (SplitReg == 0) {
+ SplitReg = MRI.createVirtualRegister(MRI.getRegClass(Def));
+ BuildMI(*KernelBB, MI, MI->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), SplitReg)
+ .addReg(Def);
+ }
+ BBJ.substituteRegister(Def, SplitReg, 0, *TRI);
+ }
+ if (!SplitReg)
+ continue;
+ // Search through each of the epilog blocks for any uses to be renamed.
+ for (auto &Epilog : EpilogBBs)
+ for (auto &I : *Epilog)
+ if (I.readsRegister(Def))
+ I.substituteRegister(Def, SplitReg, 0, *TRI);
+ break;
+ }
+ }
+ }
+}
+
+/// Remove the incoming block from the Phis in a basic block.
+static void removePhis(MachineBasicBlock *BB, MachineBasicBlock *Incoming) {
+ for (MachineInstr &MI : *BB) {
+ if (!MI.isPHI())
+ break;
+ for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2)
+ if (MI.getOperand(i + 1).getMBB() == Incoming) {
+ MI.RemoveOperand(i + 1);
+ MI.RemoveOperand(i);
+ break;
+ }
+ }
+}
+
+/// Create branches from each prolog basic block to the appropriate epilog
+/// block. These edges are needed if the loop ends before reaching the
+/// kernel.
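+/// For example, with two prolog blocks, the prolog nearest the kernel
+/// branches to the epilog nearest the kernel, so the remaining stages are
+/// drained through the epilog chain, while the first prolog branches to
+/// the last epilog, which finishes the only iteration in flight.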
+void ModuloScheduleExpander::addBranches(MachineBasicBlock &PreheaderBB,
+ MBBVectorTy &PrologBBs,
+ MachineBasicBlock *KernelBB,
+ MBBVectorTy &EpilogBBs,
+ ValueMapTy *VRMap) {
+ assert(PrologBBs.size() == EpilogBBs.size() && "Prolog/Epilog mismatch");
+ MachineBasicBlock *LastPro = KernelBB;
+ MachineBasicBlock *LastEpi = KernelBB;
+
+ // Start from the blocks connected to the kernel and work "out"
+ // to the first prolog and the last epilog blocks.
+ SmallVector<MachineInstr *, 4> PrevInsts;
+ unsigned MaxIter = PrologBBs.size() - 1;
+ for (unsigned i = 0, j = MaxIter; i <= MaxIter; ++i, --j) {
+ // Add branches to the prolog that go to the corresponding
+ // epilog, and the fall-thru prolog/kernel block.
+ MachineBasicBlock *Prolog = PrologBBs[j];
+ MachineBasicBlock *Epilog = EpilogBBs[i];
+
+ SmallVector<MachineOperand, 4> Cond;
+ Optional<bool> StaticallyGreater =
+ LoopInfo->createTripCountGreaterCondition(j + 1, *Prolog, Cond);
+ unsigned numAdded = 0;
+ if (!StaticallyGreater.hasValue()) {
+ Prolog->addSuccessor(Epilog);
+ numAdded = TII->insertBranch(*Prolog, Epilog, LastPro, Cond, DebugLoc());
+ } else if (*StaticallyGreater == false) {
+ Prolog->addSuccessor(Epilog);
+ Prolog->removeSuccessor(LastPro);
+ LastEpi->removeSuccessor(Epilog);
+ numAdded = TII->insertBranch(*Prolog, Epilog, nullptr, Cond, DebugLoc());
+ removePhis(Epilog, LastEpi);
+ // Remove the blocks that are no longer referenced.
+ if (LastPro != LastEpi) {
+ LastEpi->clear();
+ LastEpi->eraseFromParent();
+ }
+ if (LastPro == KernelBB) {
+ LoopInfo->disposed();
+ NewKernel = nullptr;
+ }
+ LastPro->clear();
+ LastPro->eraseFromParent();
+ } else {
+ numAdded = TII->insertBranch(*Prolog, LastPro, nullptr, Cond, DebugLoc());
+ removePhis(Epilog, Prolog);
+ }
+ LastPro = Prolog;
+ LastEpi = Epilog;
+ for (MachineBasicBlock::reverse_instr_iterator I = Prolog->instr_rbegin(),
+ E = Prolog->instr_rend();
+ I != E && numAdded > 0; ++I, --numAdded)
+ updateInstruction(&*I, false, j, 0, VRMap);
+ }
+
+ if (NewKernel) {
+ LoopInfo->setPreheader(PrologBBs[MaxIter]);
+ LoopInfo->adjustTripCount(-(MaxIter + 1));
+ }
+}
+
+/// Return true if we can compute the amount the instruction changes
+/// during each iteration. Set Delta to the amount of the change.
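+/// For example, if the base register of MI's memory operand is updated by
+///   base = ADD base, 8
+/// each iteration, Delta is set to 8.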
+bool ModuloScheduleExpander::computeDelta(MachineInstr &MI, unsigned &Delta) {
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ const MachineOperand *BaseOp;
+ int64_t Offset;
+ if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI))
+ return false;
+
+ if (!BaseOp->isReg())
+ return false;
+
+ Register BaseReg = BaseOp->getReg();
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ // Check if there is a Phi. If so, get the definition in the loop.
+ MachineInstr *BaseDef = MRI.getVRegDef(BaseReg);
+ if (BaseDef && BaseDef->isPHI()) {
+ BaseReg = getLoopPhiReg(*BaseDef, MI.getParent());
+ BaseDef = MRI.getVRegDef(BaseReg);
+ }
+ if (!BaseDef)
+ return false;
+
+ int D = 0;
+ if (!TII->getIncrementValue(*BaseDef, D) && D >= 0)
+ return false;
+
+ Delta = D;
+ return true;
+}
+
+/// Update the memory operand with a new offset when the pipeliner
+/// generates a new copy of the instruction that refers to a
+/// different memory location.
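+/// For example, a load whose base address advances by 8 bytes every
+/// iteration has its memory operand offset adjusted by 8 * Num in the
+/// copy that executes Num iterations later; if the per-iteration change
+/// is unknown, the memory operand is widened to an unknown size instead.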
+void ModuloScheduleExpander::updateMemOperands(MachineInstr &NewMI,
+ MachineInstr &OldMI,
+ unsigned Num) {
+ if (Num == 0)
+ return;
+ // If the instruction has memory operands, then adjust the offset
+ // when the instruction appears in different stages.
+ if (NewMI.memoperands_empty())
+ return;
+ SmallVector<MachineMemOperand *, 2> NewMMOs;
+ for (MachineMemOperand *MMO : NewMI.memoperands()) {
+ // TODO: Figure out whether isAtomic is really necessary (see D57601).
+ if (MMO->isVolatile() || MMO->isAtomic() ||
+ (MMO->isInvariant() && MMO->isDereferenceable()) ||
+ (!MMO->getValue())) {
+ NewMMOs.push_back(MMO);
+ continue;
+ }
+ unsigned Delta;
+ if (Num != UINT_MAX && computeDelta(OldMI, Delta)) {
+ int64_t AdjOffset = Delta * Num;
+ NewMMOs.push_back(
+ MF.getMachineMemOperand(MMO, AdjOffset, MMO->getSize()));
+ } else {
+ NewMMOs.push_back(
+ MF.getMachineMemOperand(MMO, 0, MemoryLocation::UnknownSize));
+ }
+ }
+ NewMI.setMemRefs(MF, NewMMOs);
+}
+
+/// Clone the instruction for the new pipelined loop and update the
+/// memory operands, if needed.
+MachineInstr *ModuloScheduleExpander::cloneInstr(MachineInstr *OldMI,
+ unsigned CurStageNum,
+ unsigned InstStageNum) {
+ MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
+ // Check for tied operands in inline asm instructions. This should be handled
+ // elsewhere, but I'm not sure of the best solution.
+ if (OldMI->isInlineAsm())
+ for (unsigned i = 0, e = OldMI->getNumOperands(); i != e; ++i) {
+ const auto &MO = OldMI->getOperand(i);
+ if (MO.isReg() && MO.isUse())
+ break;
+ unsigned UseIdx;
+ if (OldMI->isRegTiedToUseOperand(i, &UseIdx))
+ NewMI->tieOperands(i, UseIdx);
+ }
+ updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
+ return NewMI;
+}
+
+/// Clone the instruction for the new pipelined loop. If needed, this
+/// function updates the instruction using the values saved in the
+/// InstrChanges structure.
+MachineInstr *ModuloScheduleExpander::cloneAndChangeInstr(
+ MachineInstr *OldMI, unsigned CurStageNum, unsigned InstStageNum) {
+ MachineInstr *NewMI = MF.CloneMachineInstr(OldMI);
+ auto It = InstrChanges.find(OldMI);
+ if (It != InstrChanges.end()) {
+ std::pair<unsigned, int64_t> RegAndOffset = It->second;
+ unsigned BasePos, OffsetPos;
+ if (!TII->getBaseAndOffsetPosition(*OldMI, BasePos, OffsetPos))
+ return nullptr;
+ int64_t NewOffset = OldMI->getOperand(OffsetPos).getImm();
+ MachineInstr *LoopDef = findDefInLoop(RegAndOffset.first);
+ if (Schedule.getStage(LoopDef) > (signed)InstStageNum)
+ NewOffset += RegAndOffset.second * (CurStageNum - InstStageNum);
+ NewMI->getOperand(OffsetPos).setImm(NewOffset);
+ }
+ updateMemOperands(*NewMI, *OldMI, CurStageNum - InstStageNum);
+ return NewMI;
+}
+
+/// Update the machine instruction with new virtual registers. This
+/// function may change the definitions and/or uses.
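+/// For example, a definition of vreg v cloned into stage n is given a
+/// fresh register v', recorded as VRMap[n][v] = v', so that uses rewritten
+/// for stage n pick up v'.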
+void ModuloScheduleExpander::updateInstruction(MachineInstr *NewMI,
+ bool LastDef,
+ unsigned CurStageNum,
+ unsigned InstrStageNum,
+ ValueMapTy *VRMap) {
+ for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = NewMI->getOperand(i);
+ if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg()))
+ continue;
+ Register reg = MO.getReg();
+ if (MO.isDef()) {
+ // Create a new virtual register for the definition.
+ const TargetRegisterClass *RC = MRI.getRegClass(reg);
+ Register NewReg = MRI.createVirtualRegister(RC);
+ MO.setReg(NewReg);
+ VRMap[CurStageNum][reg] = NewReg;
+ if (LastDef)
+ replaceRegUsesAfterLoop(reg, NewReg, BB, MRI, LIS);
+ } else if (MO.isUse()) {
+ MachineInstr *Def = MRI.getVRegDef(reg);
+ // Compute the stage that contains the last definition for the instruction.
+ int DefStageNum = Schedule.getStage(Def);
+ unsigned StageNum = CurStageNum;
+ if (DefStageNum != -1 && (int)InstrStageNum > DefStageNum) {
+ // Compute the difference in stages between the definition and the use.
+ unsigned StageDiff = (InstrStageNum - DefStageNum);
+ // Make an adjustment to get the last definition.
+ StageNum -= StageDiff;
+ }
+ if (VRMap[StageNum].count(reg))
+ MO.setReg(VRMap[StageNum][reg]);
+ }
+ }
+}
+
+/// Return the instruction in the loop that defines the register.
+/// If the definition is a Phi, then follow the Phi operand to
+/// the instruction in the loop.
+MachineInstr *ModuloScheduleExpander::findDefInLoop(unsigned Reg) {
+ SmallPtrSet<MachineInstr *, 8> Visited;
+ MachineInstr *Def = MRI.getVRegDef(Reg);
+ while (Def->isPHI()) {
+ if (!Visited.insert(Def).second)
+ break;
+ for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2)
+ if (Def->getOperand(i + 1).getMBB() == BB) {
+ Def = MRI.getVRegDef(Def->getOperand(i).getReg());
+ break;
+ }
+ }
+ return Def;
+}
+
+/// Return the new name for the value from the previous stage.
+unsigned ModuloScheduleExpander::getPrevMapVal(
+ unsigned StageNum, unsigned PhiStage, unsigned LoopVal, unsigned LoopStage,
+ ValueMapTy *VRMap, MachineBasicBlock *BB) {
+ unsigned PrevVal = 0;
+ if (StageNum > PhiStage) {
+ MachineInstr *LoopInst = MRI.getVRegDef(LoopVal);
+ if (PhiStage == LoopStage && VRMap[StageNum - 1].count(LoopVal))
+ // The name is defined in the previous stage.
+ PrevVal = VRMap[StageNum - 1][LoopVal];
+ else if (VRMap[StageNum].count(LoopVal))
+ // The previous name is defined in the current stage when the instruction
+ // order is swapped.
+ PrevVal = VRMap[StageNum][LoopVal];
+ else if (!LoopInst->isPHI() || LoopInst->getParent() != BB)
+ // The loop value hasn't yet been scheduled.
+ PrevVal = LoopVal;
+ else if (StageNum == PhiStage + 1)
+ // The loop value is another phi, which has not been scheduled.
+ PrevVal = getInitPhiReg(*LoopInst, BB);
+ else if (StageNum > PhiStage + 1 && LoopInst->getParent() == BB)
+ // The loop value is another phi, which has been scheduled.
+ PrevVal =
+ getPrevMapVal(StageNum - 1, PhiStage, getLoopPhiReg(*LoopInst, BB),
+ LoopStage, VRMap, BB);
+ }
+ return PrevVal;
+}
+
+/// Rewrite the Phi values in the specified block to use the mappings
+/// from the initial operand. Once the Phi is scheduled, we switch
+/// to using the loop value instead of the Phi value, so those names
+/// do not need to be rewritten.
+void ModuloScheduleExpander::rewritePhiValues(MachineBasicBlock *NewBB,
+ unsigned StageNum,
+ ValueMapTy *VRMap,
+ InstrMapTy &InstrMap) {
+ for (auto &PHI : BB->phis()) {
+ unsigned InitVal = 0;
+ unsigned LoopVal = 0;
+ getPhiRegs(PHI, BB, InitVal, LoopVal);
+ Register PhiDef = PHI.getOperand(0).getReg();
+
+ unsigned PhiStage = (unsigned)Schedule.getStage(MRI.getVRegDef(PhiDef));
+ unsigned LoopStage = (unsigned)Schedule.getStage(MRI.getVRegDef(LoopVal));
+ unsigned NumPhis = getStagesForPhi(PhiDef);
+ if (NumPhis > StageNum)
+ NumPhis = StageNum;
+ for (unsigned np = 0; np <= NumPhis; ++np) {
+ unsigned NewVal =
+ getPrevMapVal(StageNum - np, PhiStage, LoopVal, LoopStage, VRMap, BB);
+ if (!NewVal)
+ NewVal = InitVal;
+ rewriteScheduledInstr(NewBB, InstrMap, StageNum - np, np, &PHI, PhiDef,
+ NewVal);
+ }
+ }
+}
+
+/// Rewrite a previously scheduled instruction to use the register value
+/// from the new instruction. Make sure the instruction occurs in the
+ // basic block, and that we don't change the uses in the new instruction.
+void ModuloScheduleExpander::rewriteScheduledInstr(
+ MachineBasicBlock *BB, InstrMapTy &InstrMap, unsigned CurStageNum,
+ unsigned PhiNum, MachineInstr *Phi, unsigned OldReg, unsigned NewReg,
+ unsigned PrevReg) {
+ bool InProlog = (CurStageNum < (unsigned)Schedule.getNumStages() - 1);
+ int StagePhi = Schedule.getStage(Phi) + PhiNum;
+ // Rewrite uses that have been scheduled already to use the new
+ // Phi register.
+ for (MachineRegisterInfo::use_iterator UI = MRI.use_begin(OldReg),
+ EI = MRI.use_end();
+ UI != EI;) {
+ MachineOperand &UseOp = *UI;
+ MachineInstr *UseMI = UseOp.getParent();
+ ++UI;
+ if (UseMI->getParent() != BB)
+ continue;
+ if (UseMI->isPHI()) {
+ if (!Phi->isPHI() && UseMI->getOperand(0).getReg() == NewReg)
+ continue;
+ if (getLoopPhiReg(*UseMI, BB) != OldReg)
+ continue;
+ }
+ InstrMapTy::iterator OrigInstr = InstrMap.find(UseMI);
+ assert(OrigInstr != InstrMap.end() && "Instruction not scheduled.");
+ MachineInstr *OrigMI = OrigInstr->second;
+ int StageSched = Schedule.getStage(OrigMI);
+ int CycleSched = Schedule.getCycle(OrigMI);
+ unsigned ReplaceReg = 0;
+ // This is the stage for the scheduled instruction.
+ if (StagePhi == StageSched && Phi->isPHI()) {
+ int CyclePhi = Schedule.getCycle(Phi);
+ if (PrevReg && InProlog)
+ ReplaceReg = PrevReg;
+ else if (PrevReg && !isLoopCarried(*Phi) &&
+ (CyclePhi <= CycleSched || OrigMI->isPHI()))
+ ReplaceReg = PrevReg;
+ else
+ ReplaceReg = NewReg;
+ }
+ // The scheduled instruction occurs before the scheduled Phi, and the
+ // Phi is not loop carried.
+ if (!InProlog && StagePhi + 1 == StageSched && !isLoopCarried(*Phi))
+ ReplaceReg = NewReg;
+ if (StagePhi > StageSched && Phi->isPHI())
+ ReplaceReg = NewReg;
+ if (!InProlog && !Phi->isPHI() && StagePhi < StageSched)
+ ReplaceReg = NewReg;
+ if (ReplaceReg) {
+ MRI.constrainRegClass(ReplaceReg, MRI.getRegClass(OldReg));
+ UseOp.setReg(ReplaceReg);
+ }
+ }
+}
+
+bool ModuloScheduleExpander::isLoopCarried(MachineInstr &Phi) {
+ if (!Phi.isPHI())
+ return false;
+ unsigned DefCycle = Schedule.getCycle(&Phi);
+ int DefStage = Schedule.getStage(&Phi);
+
+ unsigned InitVal = 0;
+ unsigned LoopVal = 0;
+ getPhiRegs(Phi, Phi.getParent(), InitVal, LoopVal);
+ MachineInstr *Use = MRI.getVRegDef(LoopVal);
+ if (!Use || Use->isPHI())
+ return true;
+ unsigned LoopCycle = Schedule.getCycle(Use);
+ int LoopStage = Schedule.getStage(Use);
+ return (LoopCycle > DefCycle) || (LoopStage <= DefStage);
+}
+
+//===----------------------------------------------------------------------===//
+// PeelingModuloScheduleExpander implementation
+//===----------------------------------------------------------------------===//
+// This is a reimplementation of ModuloScheduleExpander that works by creating
+// a fully correct steady-state kernel and peeling off the prolog and epilogs.
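+//
+// For a hypothetical 3-stage schedule, the resulting block layout is roughly:
+//   P0 [0]  P1 [0, 1]  // peeled prologs (live stages in brackets)
+//   K  [0, 1, 2]       // steady-state kernel
+//   E* [...]           // peeled epilogs draining the remaining stages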
+//===----------------------------------------------------------------------===//
+
+namespace {
+// Remove any dead phis in MBB. Dead phis either have only one block as input
+// (in which case they are the identity) or have no uses.
+void EliminateDeadPhis(MachineBasicBlock *MBB, MachineRegisterInfo &MRI,
+ LiveIntervals *LIS) {
+ bool Changed = true;
+ while (Changed) {
+ Changed = false;
+ for (auto I = MBB->begin(); I != MBB->getFirstNonPHI();) {
+ MachineInstr &MI = *I++;
+ assert(MI.isPHI());
+ if (MRI.use_empty(MI.getOperand(0).getReg())) {
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(MI);
+ MI.eraseFromParent();
+ Changed = true;
+ } else if (MI.getNumExplicitOperands() == 3) {
+ MRI.constrainRegClass(MI.getOperand(1).getReg(),
+ MRI.getRegClass(MI.getOperand(0).getReg()));
+ MRI.replaceRegWith(MI.getOperand(0).getReg(),
+ MI.getOperand(1).getReg());
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(MI);
+ MI.eraseFromParent();
+ Changed = true;
+ }
+ }
+ }
+}
+
+/// Rewrites the kernel block in-place to adhere to the given schedule.
+/// KernelRewriter holds all of the state required to perform the rewriting.
+class KernelRewriter {
+ ModuloSchedule &S;
+ MachineBasicBlock *BB;
+ MachineBasicBlock *PreheaderBB, *ExitBB;
+ MachineRegisterInfo &MRI;
+ const TargetInstrInfo *TII;
+ LiveIntervals *LIS;
+
+ // Map from register class to canonical undef register for that class.
+ DenseMap<const TargetRegisterClass *, Register> Undefs;
+ // Map from <LoopReg, InitReg> to phi register for all created phis. Note that
+ // this map is only used when InitReg is non-undef.
+ DenseMap<std::pair<unsigned, unsigned>, Register> Phis;
+ // Map from LoopReg to phi register where the InitReg is undef.
+ DenseMap<Register, Register> UndefPhis;
+
+ // Reg is used by MI. Return the new register MI should use to adhere to the
+ // schedule. Insert phis as necessary.
+ Register remapUse(Register Reg, MachineInstr &MI);
+ // Insert a phi that carries LoopReg from the loop body and InitReg otherwise.
+ // If InitReg is not given it is chosen arbitrarily. It will either be undef
+ // or will be chosen so as to share another phi.
+ Register phi(Register LoopReg, Optional<Register> InitReg = {},
+ const TargetRegisterClass *RC = nullptr);
+ // Create an undef register of the given register class.
+ Register undef(const TargetRegisterClass *RC);
+
+public:
+ KernelRewriter(MachineLoop &L, ModuloSchedule &S,
+ LiveIntervals *LIS = nullptr);
+ void rewrite();
+};
+} // namespace
+
+KernelRewriter::KernelRewriter(MachineLoop &L, ModuloSchedule &S,
+ LiveIntervals *LIS)
+ : S(S), BB(L.getTopBlock()), PreheaderBB(L.getLoopPreheader()),
+ ExitBB(L.getExitBlock()), MRI(BB->getParent()->getRegInfo()),
+ TII(BB->getParent()->getSubtarget().getInstrInfo()), LIS(LIS) {
+ PreheaderBB = *BB->pred_begin();
+ if (PreheaderBB == BB)
+ PreheaderBB = *std::next(BB->pred_begin());
+}
+
+void KernelRewriter::rewrite() {
+ // Rearrange the loop to be in schedule order. Note that the schedule may
+ // contain instructions that are not owned by the loop block (InstrChanges and
+ // friends), so we gracefully handle unowned instructions and delete any
+ // instructions that weren't in the schedule.
+ auto InsertPt = BB->getFirstTerminator();
+ MachineInstr *FirstMI = nullptr;
+ for (MachineInstr *MI : S.getInstructions()) {
+ if (MI->isPHI())
+ continue;
+ if (MI->getParent())
+ MI->removeFromParent();
+ BB->insert(InsertPt, MI);
+ if (!FirstMI)
+ FirstMI = MI;
+ }
+ assert(FirstMI && "Failed to find first MI in schedule");
+
+ // At this point all of the scheduled instructions are between FirstMI
+ // and the end of the block. Kill from the first non-phi to FirstMI.
+ for (auto I = BB->getFirstNonPHI(); I != FirstMI->getIterator();) {
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(*I);
+ (I++)->eraseFromParent();
+ }
+
+ // Now remap every instruction in the loop.
+ for (MachineInstr &MI : *BB) {
+ if (MI.isPHI() || MI.isTerminator())
+ continue;
+ for (MachineOperand &MO : MI.uses()) {
+ if (!MO.isReg() || MO.getReg().isPhysical() || MO.isImplicit())
+ continue;
+ Register Reg = remapUse(MO.getReg(), MI);
+ MO.setReg(Reg);
+ }
+ }
+ EliminateDeadPhis(BB, MRI, LIS);
+
+ // Ensure a phi exists for all instructions that are either referenced by
+ // an illegal phi or by an instruction outside the loop. This allows us to
+ // treat remaps of these values the same as "normal" values that come from
+ // loop-carried phis.
+ for (auto MI = BB->getFirstNonPHI(); MI != BB->end(); ++MI) {
+ if (MI->isPHI()) {
+ Register R = MI->getOperand(0).getReg();
+ phi(R);
+ continue;
+ }
+
+ for (MachineOperand &Def : MI->defs()) {
+ for (MachineInstr &MI : MRI.use_instructions(Def.getReg())) {
+ if (MI.getParent() != BB) {
+ phi(Def.getReg());
+ break;
+ }
+ }
+ }
+ }
+}
+
+Register KernelRewriter::remapUse(Register Reg, MachineInstr &MI) {
+ MachineInstr *Producer = MRI.getUniqueVRegDef(Reg);
+ if (!Producer)
+ return Reg;
+
+ int ConsumerStage = S.getStage(&MI);
+ if (!Producer->isPHI()) {
+ // Non-phi producers are simple to remap. Insert as many phis as the
+ // difference between the consumer and producer stages.
+ if (Producer->getParent() != BB)
+ // Producer was not inside the loop. Use the register as-is.
+ return Reg;
+ int ProducerStage = S.getStage(Producer);
+ assert(ConsumerStage != -1 &&
+ "In-loop consumer should always be scheduled!");
+ assert(ConsumerStage >= ProducerStage);
+ unsigned StageDiff = ConsumerStage - ProducerStage;
+
+ for (unsigned I = 0; I < StageDiff; ++I)
+ Reg = phi(Reg);
+ return Reg;
+ }
+
+ // First, dive through the phi chain to find the defaults for the generated
+ // phis.
+ SmallVector<Optional<Register>, 4> Defaults;
+ Register LoopReg = Reg;
+ auto LoopProducer = Producer;
+ while (LoopProducer->isPHI() && LoopProducer->getParent() == BB) {
+ LoopReg = getLoopPhiReg(*LoopProducer, BB);
+ Defaults.emplace_back(getInitPhiReg(*LoopProducer, BB));
+ LoopProducer = MRI.getUniqueVRegDef(LoopReg);
+ assert(LoopProducer);
+ }
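+ // For example (register names illustrative): given the in-loop phis
+ //   %a = PHI [%init0, %preheader], [%b, %bb]
+ //   %b = PHI [%init1, %preheader], [%c, %bb]
+ // a use of %a dives to LoopReg == %c with Defaults == {%init0, %init1}.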
+ int LoopProducerStage = S.getStage(LoopProducer);
+
+ Optional<Register> IllegalPhiDefault;
+
+ if (LoopProducerStage == -1) {
+ // Do nothing.
+ } else if (LoopProducerStage > ConsumerStage) {
+ // This schedule is only representable if LoopProducerStage == ConsumerStage+1.
+ // In addition, Consumer's cycle must be scheduled after Producer in the
+ // rescheduled loop. This is enforced by the pipeliner's ASAP and ALAP
+ // functions.
+#ifndef NDEBUG // Silence unused variables in non-asserts mode.
+ int LoopProducerCycle = S.getCycle(LoopProducer);
+ int ConsumerCycle = S.getCycle(&MI);
+#endif
+ assert(LoopProducerCycle <= ConsumerCycle);
+ assert(LoopProducerStage == ConsumerStage + 1);
+ // Peel off the first phi from Defaults and insert a phi between producer
+ // and consumer. This phi will not be at the front of the block so we
+ // consider it illegal. It will only exist during the rewrite process; it
+ // needs to exist while we peel off prologs because these could take the
+ // default value. After that we can replace all uses with the loop producer
+ // value.
+ IllegalPhiDefault = Defaults.front();
+ Defaults.erase(Defaults.begin());
+ } else {
+ assert(ConsumerStage >= LoopProducerStage);
+ int StageDiff = ConsumerStage - LoopProducerStage;
+ if (StageDiff > 0) {
+ LLVM_DEBUG(dbgs() << " -- padding defaults array from " << Defaults.size()
+ << " to " << (Defaults.size() + StageDiff) << "\n");
+ // If we need more phis than we have defaults for, pad out with undefs for
+ // the earliest phis, which are at the end of the defaults chain (the
+ // chain is in reverse order).
+ Defaults.resize(Defaults.size() + StageDiff, Defaults.empty()
+ ? Optional<Register>()
+ : Defaults.back());
+ }
+ }
+
+ // Now we know the number of stages to jump back, insert the phi chain.
+ auto DefaultI = Defaults.rbegin();
+ while (DefaultI != Defaults.rend())
+ LoopReg = phi(LoopReg, *DefaultI++, MRI.getRegClass(Reg));
+
+ if (IllegalPhiDefault.hasValue()) {
+ // The consumer either consumes LoopProducer in the same iteration
+ // (because the producer is scheduled at an earlier cycle than the consumer)
+ // or the initial value. To facilitate this we create an illegal block here
+ // by embedding a phi in the middle of the block. We will fix this up
+ // immediately prior to pruning.
+ auto RC = MRI.getRegClass(Reg);
+ Register R = MRI.createVirtualRegister(RC);
+ BuildMI(*BB, MI, DebugLoc(), TII->get(TargetOpcode::PHI), R)
+ .addReg(IllegalPhiDefault.getValue())
+ .addMBB(PreheaderBB) // Block choice is arbitrary and has no effect.
+ .addReg(LoopReg)
+ .addMBB(BB); // Block choice is arbitrary and has no effect.
+ return R;
+ }
+
+ return LoopReg;
+}
+
+Register KernelRewriter::phi(Register LoopReg, Optional<Register> InitReg,
+ const TargetRegisterClass *RC) {
+ // If the init register is not undef, try and find an existing phi.
+ if (InitReg.hasValue()) {
+ auto I = Phis.find({LoopReg, InitReg.getValue()});
+ if (I != Phis.end())
+ return I->second;
+ } else {
+ for (auto &KV : Phis) {
+ if (KV.first.first == LoopReg)
+ return KV.second;
+ }
+ }
+
+ // InitReg is either undef or no existing phi takes InitReg as input. Try and
+ // find a phi that takes undef as input.
+ auto I = UndefPhis.find(LoopReg);
+ if (I != UndefPhis.end()) {
+ Register R = I->second;
+ if (!InitReg.hasValue())
+ // Found a phi taking undef as input, and this input is undef so return
+ // without any more changes.
+ return R;
+ // Found a phi taking undef as input, so rewrite it to take InitReg.
+ MachineInstr *MI = MRI.getVRegDef(R);
+ MI->getOperand(1).setReg(InitReg.getValue());
+ Phis.insert({{LoopReg, InitReg.getValue()}, R});
+ MRI.constrainRegClass(R, MRI.getRegClass(InitReg.getValue()));
+ UndefPhis.erase(I);
+ return R;
+ }
+
+ // Failed to find any existing phi to reuse, so create a new one.
+ if (!RC)
+ RC = MRI.getRegClass(LoopReg);
+ Register R = MRI.createVirtualRegister(RC);
+ if (InitReg.hasValue())
+ MRI.constrainRegClass(R, MRI.getRegClass(*InitReg));
+ BuildMI(*BB, BB->getFirstNonPHI(), DebugLoc(), TII->get(TargetOpcode::PHI), R)
+ .addReg(InitReg.hasValue() ? *InitReg : undef(RC))
+ .addMBB(PreheaderBB)
+ .addReg(LoopReg)
+ .addMBB(BB);
+ if (!InitReg.hasValue())
+ UndefPhis[LoopReg] = R;
+ else
+ Phis[{LoopReg, *InitReg}] = R;
+ return R;
+}
+
+Register KernelRewriter::undef(const TargetRegisterClass *RC) {
+ Register &R = Undefs[RC];
+ if (R == 0) {
+ // Create an IMPLICIT_DEF that defines this register if we need it.
+ // All uses of this should be removed by the time we have finished unrolling
+ // prologs and epilogs.
+ R = MRI.createVirtualRegister(RC);
+ auto *InsertBB = &PreheaderBB->getParent()->front();
+ BuildMI(*InsertBB, InsertBB->getFirstTerminator(), DebugLoc(),
+ TII->get(TargetOpcode::IMPLICIT_DEF), R);
+ }
+ return R;
+}
+
+namespace {
+/// Describes an operand in the kernel of a pipelined loop. Characteristics of
+/// the operand are discovered, such as how many in-loop PHIs it has to jump
+/// through and defaults for these phis.
+class KernelOperandInfo {
+ MachineBasicBlock *BB;
+ MachineRegisterInfo &MRI;
+ SmallVector<Register, 4> PhiDefaults;
+ MachineOperand *Source;
+ MachineOperand *Target;
+
+public:
+ KernelOperandInfo(MachineOperand *MO, MachineRegisterInfo &MRI,
+ const SmallPtrSetImpl<MachineInstr *> &IllegalPhis)
+ : MRI(MRI) {
+ Source = MO;
+ BB = MO->getParent()->getParent();
+ while (isRegInLoop(MO)) {
+ MachineInstr *MI = MRI.getVRegDef(MO->getReg());
+ if (MI->isFullCopy()) {
+ MO = &MI->getOperand(1);
+ continue;
+ }
+ if (!MI->isPHI())
+ break;
+ // If this is an illegal phi, don't count it in distance.
+ if (IllegalPhis.count(MI)) {
+ MO = &MI->getOperand(3);
+ continue;
+ }
+
+ Register Default = getInitPhiReg(*MI, BB);
+ MO = MI->getOperand(2).getMBB() == BB ? &MI->getOperand(1)
+ : &MI->getOperand(3);
+ PhiDefaults.push_back(Default);
+ }
+ Target = MO;
+ }
+
+ bool operator==(const KernelOperandInfo &Other) const {
+ return PhiDefaults.size() == Other.PhiDefaults.size();
+ }
+
+ void print(raw_ostream &OS) const {
+ OS << "use of " << *Source << ": distance(" << PhiDefaults.size() << ") in "
+ << *Source->getParent();
+ }
+
+private:
+ bool isRegInLoop(MachineOperand *MO) {
+ return MO->isReg() && MO->getReg().isVirtual() &&
+ MRI.getVRegDef(MO->getReg())->getParent() == BB;
+ }
+};
+} // namespace
+
+MachineBasicBlock *
+PeelingModuloScheduleExpander::peelKernel(LoopPeelDirection LPD) {
+ MachineBasicBlock *NewBB = PeelSingleBlockLoop(LPD, BB, MRI, TII);
+ if (LPD == LPD_Front)
+ PeeledFront.push_back(NewBB);
+ else
+ PeeledBack.push_front(NewBB);
+ for (auto I = BB->begin(), NI = NewBB->begin(); !I->isTerminator();
+ ++I, ++NI) {
+ CanonicalMIs[&*I] = &*I;
+ CanonicalMIs[&*NI] = &*I;
+ BlockMIs[{NewBB, &*I}] = &*NI;
+ BlockMIs[{BB, &*I}] = &*I;
+ }
+ return NewBB;
+}
+
+void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
+ BitVector LS(Schedule.getNumStages(), true);
+ BitVector AS(Schedule.getNumStages(), true);
+ LiveStages[BB] = LS;
+ AvailableStages[BB] = AS;
+
+ // Peel out the prologs.
+ LS.reset();
+ for (int I = 0; I < Schedule.getNumStages() - 1; ++I) {
+ LS[I] = 1;
+ Prologs.push_back(peelKernel(LPD_Front));
+ LiveStages[Prologs.back()] = LS;
+ AvailableStages[Prologs.back()] = LS;
+ }
+
+ // Create a block that will end up as the new loop exiting block (dominated by
+ // all prologs and epilogs). It will only contain PHIs, in the same order as
+ // BB's PHIs. This gives us a poor-man's LCSSA with the inductive property
+ // that the exiting block is a (sub) clone of BB. This in turn gives us the
+ // property that any value defined in BB but used outside of BB is used by a
+ // PHI in the exiting block.
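+ // For example (names illustrative): a value %v defined in BB and used
+ // outside the loop is routed through %v.lcssa = PHI [%v, BB] in the
+ // exiting block, so out-of-loop uses can be remapped through that PHI.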
+ MachineBasicBlock *ExitingBB = CreateLCSSAExitingBlock();
+
+ // Push out the epilogs, again in reverse order.
+ // We can't assume anything about the minimum loop trip count at this point,
+ // so emit a fairly complex epilog:
+ // K[0, 1, 2] // Kernel runs stages 0, 1, 2
+ // E0[2] <- P1 // Epilog runs stage 2 only, so the state after is [0].
+ // E1[1, 2] <- P0 // Epilog 1 moves the last item from stage 0 to stage 2.
+ //
+ // This creates a single-successor single-predecessor sequence of blocks for
+ // each epilog, which are kept this way for simplicity at this stage and
+ // cleaned up by the optimizer later.
+ for (int I = 1; I <= Schedule.getNumStages() - 1; ++I) {
+ Epilogs.push_back(nullptr);
+ for (int J = Schedule.getNumStages() - 1; J >= I; --J) {
+ LS.reset();
+ LS[J] = 1;
+ Epilogs.back() = peelKernel(LPD_Back);
+ LiveStages[Epilogs.back()] = LS;
+ AvailableStages[Epilogs.back()] = AS;
+ }
+ }
+
+ // Now we've defined all the prolog and epilog blocks as a fallthrough
+ // sequence, add the edges that will be followed if the loop trip count is
+ // lower than the number of stages (connecting prologs directly with epilogs).
+ auto PI = Prologs.begin();
+ auto EI = Epilogs.begin();
+ assert(Prologs.size() == Epilogs.size());
+ for (; PI != Prologs.end(); ++PI, ++EI) {
+ MachineBasicBlock *Pred = *(*EI)->pred_begin();
+ (*PI)->addSuccessor(*EI);
+ for (MachineInstr &MI : (*EI)->phis()) {
+ Register Reg = MI.getOperand(1).getReg();
+ MachineInstr *Use = MRI.getUniqueVRegDef(Reg);
+ if (Use && Use->getParent() == Pred)
+ Reg = getEquivalentRegisterIn(Reg, *PI);
+ MI.addOperand(MachineOperand::CreateReg(Reg, /*isDef=*/false));
+ MI.addOperand(MachineOperand::CreateMBB(*PI));
+ }
+ }
+
+ // Create a list of all blocks in order.
+ SmallVector<MachineBasicBlock *, 8> Blocks;
+ llvm::copy(PeeledFront, std::back_inserter(Blocks));
+ Blocks.push_back(BB);
+ llvm::copy(PeeledBack, std::back_inserter(Blocks));
+
+ // Iterate in reverse order over all instructions, remapping as we go.
+ for (MachineBasicBlock *B : reverse(Blocks)) {
+ for (auto I = B->getFirstInstrTerminator()->getReverseIterator();
+ I != std::next(B->getFirstNonPHI()->getReverseIterator());) {
+ MachineInstr *MI = &*I++;
+ rewriteUsesOf(MI);
+ }
+ }
+ // Now all remapping has been done, we're free to optimize the generated code.
+ for (MachineBasicBlock *B : reverse(Blocks))
+ EliminateDeadPhis(B, MRI, LIS);
+ EliminateDeadPhis(ExitingBB, MRI, LIS);
+}
+
+MachineBasicBlock *PeelingModuloScheduleExpander::CreateLCSSAExitingBlock() {
+ MachineFunction &MF = *BB->getParent();
+ MachineBasicBlock *Exit = *BB->succ_begin();
+ if (Exit == BB)
+ Exit = *std::next(BB->succ_begin());
+
+ MachineBasicBlock *NewBB = MF.CreateMachineBasicBlock(BB->getBasicBlock());
+ MF.insert(std::next(BB->getIterator()), NewBB);
+
+ // Clone all phis in BB into NewBB and rewrite.
+ for (MachineInstr &MI : BB->phis()) {
+ auto RC = MRI.getRegClass(MI.getOperand(0).getReg());
+ Register OldR = MI.getOperand(3).getReg();
+ Register R = MRI.createVirtualRegister(RC);
+ SmallVector<MachineInstr *, 4> Uses;
+ for (MachineInstr &Use : MRI.use_instructions(OldR))
+ if (Use.getParent() != BB)
+ Uses.push_back(&Use);
+ for (MachineInstr *Use : Uses)
+ Use->substituteRegister(OldR, R, /*SubIdx=*/0,
+ *MRI.getTargetRegisterInfo());
+ MachineInstr *NI = BuildMI(NewBB, DebugLoc(), TII->get(TargetOpcode::PHI), R)
+ .addReg(OldR)
+ .addMBB(BB);
+ BlockMIs[{NewBB, &MI}] = NI;
+ CanonicalMIs[NI] = &MI;
+ }
+ BB->replaceSuccessor(Exit, NewBB);
+ Exit->replacePhiUsesWith(BB, NewBB);
+ NewBB->addSuccessor(Exit);
+
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ bool CanAnalyzeBr = !TII->analyzeBranch(*BB, TBB, FBB, Cond);
+ (void)CanAnalyzeBr;
+ assert(CanAnalyzeBr && "Must be able to analyze the loop branch!");
+ TII->removeBranch(*BB);
+ TII->insertBranch(*BB, TBB == Exit ? NewBB : TBB, FBB == Exit ? NewBB : FBB,
+ Cond, DebugLoc());
+ TII->insertUnconditionalBranch(*NewBB, Exit, DebugLoc());
+ return NewBB;
+}
+
+Register
+PeelingModuloScheduleExpander::getEquivalentRegisterIn(Register Reg,
+ MachineBasicBlock *BB) {
+ MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
+ unsigned OpIdx = MI->findRegisterDefOperandIdx(Reg);
+ return BlockMIs[{BB, CanonicalMIs[MI]}]->getOperand(OpIdx).getReg();
+}
+
+void PeelingModuloScheduleExpander::rewriteUsesOf(MachineInstr *MI) {
+ if (MI->isPHI()) {
+ // This is an illegal PHI. The loop-carried (desired) value is operand 3,
+ // and it is produced by this block.
+ Register PhiR = MI->getOperand(0).getReg();
+ Register R = MI->getOperand(3).getReg();
+ int RMIStage = getStage(MRI.getUniqueVRegDef(R));
+ if (RMIStage != -1 && !AvailableStages[MI->getParent()].test(RMIStage))
+ R = MI->getOperand(1).getReg();
+ MRI.setRegClass(R, MRI.getRegClass(PhiR));
+ MRI.replaceRegWith(PhiR, R);
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+ return;
+ }
+
+ int Stage = getStage(MI);
+ if (Stage == -1 || LiveStages.count(MI->getParent()) == 0 ||
+ LiveStages[MI->getParent()].test(Stage))
+ // Instruction is live, no rewriting to do.
+ return;
+
+ for (MachineOperand &DefMO : MI->defs()) {
+ SmallVector<std::pair<MachineInstr *, Register>, 4> Subs;
+ for (MachineInstr &UseMI : MRI.use_instructions(DefMO.getReg())) {
+ // Only PHIs can use values from this block by construction.
+ // Match with the equivalent PHI in B.
+ assert(UseMI.isPHI());
+ Register Reg = getEquivalentRegisterIn(UseMI.getOperand(0).getReg(),
+ MI->getParent());
+ Subs.emplace_back(&UseMI, Reg);
+ }
+ for (auto &Sub : Subs)
+ Sub.first->substituteRegister(DefMO.getReg(), Sub.second, /*SubIdx=*/0,
+ *MRI.getTargetRegisterInfo());
+ }
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+}
+
+void PeelingModuloScheduleExpander::fixupBranches() {
+ std::unique_ptr<TargetInstrInfo::PipelinerLoopInfo> Info =
+ TII->analyzeLoopForPipelining(BB);
+ assert(Info);
+
+ // Work outwards from the kernel.
+ bool KernelDisposed = false;
+ int TC = Schedule.getNumStages() - 1;
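+ // E.g. (stage count illustrative): with 3 stages, the prolog adjacent to
+ // the kernel branches on TC > 2 and the outermost prolog on TC > 1.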
+ for (auto PI = Prologs.rbegin(), EI = Epilogs.rbegin(); PI != Prologs.rend();
+ ++PI, ++EI, --TC) {
+ MachineBasicBlock *Prolog = *PI;
+ MachineBasicBlock *Fallthrough = *Prolog->succ_begin();
+ MachineBasicBlock *Epilog = *EI;
+ SmallVector<MachineOperand, 4> Cond;
+ TII->removeBranch(*Prolog);
+ Optional<bool> StaticallyGreater =
+ Info->createTripCountGreaterCondition(TC, *Prolog, Cond);
+ if (!StaticallyGreater.hasValue()) {
+ LLVM_DEBUG(dbgs() << "Dynamic: TC > " << TC << "\n");
+ // Dynamically branch based on Cond.
+ TII->insertBranch(*Prolog, Epilog, Fallthrough, Cond, DebugLoc());
+ } else if (*StaticallyGreater == false) {
+ LLVM_DEBUG(dbgs() << "Static-false: TC > " << TC << "\n");
+ // Prolog never falls through; branch to epilog and orphan interior
+ // blocks. Leave it to unreachable-block-elim to clean up.
+ Prolog->removeSuccessor(Fallthrough);
+ for (MachineInstr &P : Fallthrough->phis()) {
+ P.RemoveOperand(2);
+ P.RemoveOperand(1);
+ }
+ TII->insertUnconditionalBranch(*Prolog, Epilog, DebugLoc());
+ KernelDisposed = true;
+ } else {
+ LLVM_DEBUG(dbgs() << "Static-true: TC > " << TC << "\n");
+ // Prolog always falls through; remove incoming values in epilog.
+ Prolog->removeSuccessor(Epilog);
+ for (MachineInstr &P : Epilog->phis()) {
+ P.RemoveOperand(4);
+ P.RemoveOperand(3);
+ }
+ }
+ }
+
+ if (!KernelDisposed) {
+ Info->adjustTripCount(-(Schedule.getNumStages() - 1));
+ Info->setPreheader(Prologs.back());
+ } else {
+ Info->disposed();
+ }
+}
+
+void PeelingModuloScheduleExpander::rewriteKernel() {
+ KernelRewriter KR(*Schedule.getLoop(), Schedule);
+ KR.rewrite();
+}
+
+void PeelingModuloScheduleExpander::expand() {
+ BB = Schedule.getLoop()->getTopBlock();
+ Preheader = Schedule.getLoop()->getLoopPreheader();
+ LLVM_DEBUG(Schedule.dump());
+
+ rewriteKernel();
+ peelPrologAndEpilogs();
+ fixupBranches();
+}
+
+void PeelingModuloScheduleExpander::validateAgainstModuloScheduleExpander() {
+ BB = Schedule.getLoop()->getTopBlock();
+ Preheader = Schedule.getLoop()->getLoopPreheader();
+
+ // Dump the schedule before we invalidate and remap all its instructions.
+ // Stash it in a string so we can print it if we found an error.
+ std::string ScheduleDump;
+ raw_string_ostream OS(ScheduleDump);
+ Schedule.print(OS);
+ OS.flush();
+
+ // First, run the normal ModuloScheduleExpander. We don't support any
+ // InstrChanges.
+ assert(LIS && "Requires LiveIntervals!");
+ ModuloScheduleExpander MSE(MF, Schedule, *LIS,
+ ModuloScheduleExpander::InstrChangesTy());
+ MSE.expand();
+ MachineBasicBlock *ExpandedKernel = MSE.getRewrittenKernel();
+ if (!ExpandedKernel) {
+ // The expander optimized away the kernel. We can't do any useful checking.
+ MSE.cleanup();
+ return;
+ }
+ // Before running the KernelRewriter, re-add BB into the CFG.
+ Preheader->addSuccessor(BB);
+
+ // Now run the new expansion algorithm.
+ KernelRewriter KR(*Schedule.getLoop(), Schedule);
+ KR.rewrite();
+ peelPrologAndEpilogs();
+
+ // Collect all illegal phis that the new algorithm created. We'll give these
+ // to KernelOperandInfo.
+ SmallPtrSet<MachineInstr *, 4> IllegalPhis;
+ for (auto NI = BB->getFirstNonPHI(); NI != BB->end(); ++NI) {
+ if (NI->isPHI())
+ IllegalPhis.insert(&*NI);
+ }
+
+ // Co-iterate across both kernels. We expect them to be identical apart from
+ // phis and full COPYs (we look through both).
+ SmallVector<std::pair<KernelOperandInfo, KernelOperandInfo>, 8> KOIs;
+ auto OI = ExpandedKernel->begin();
+ auto NI = BB->begin();
+ for (; !OI->isTerminator() && !NI->isTerminator(); ++OI, ++NI) {
+ while (OI->isPHI() || OI->isFullCopy())
+ ++OI;
+ while (NI->isPHI() || NI->isFullCopy())
+ ++NI;
+ assert(OI->getOpcode() == NI->getOpcode() && "Opcodes don't match?!");
+ // Analyze every operand separately.
+ for (auto OOpI = OI->operands_begin(), NOpI = NI->operands_begin();
+ OOpI != OI->operands_end(); ++OOpI, ++NOpI)
+ KOIs.emplace_back(KernelOperandInfo(&*OOpI, MRI, IllegalPhis),
+ KernelOperandInfo(&*NOpI, MRI, IllegalPhis));
+ }
+
+ bool Failed = false;
+ for (auto &OldAndNew : KOIs) {
+ if (OldAndNew.first == OldAndNew.second)
+ continue;
+ Failed = true;
+ errs() << "Modulo kernel validation error: [\n";
+ errs() << " [golden] ";
+ OldAndNew.first.print(errs());
+ errs() << " ";
+ OldAndNew.second.print(errs());
+ errs() << "]\n";
+ }
+
+ if (Failed) {
+ errs() << "Golden reference kernel:\n";
+ ExpandedKernel->print(errs());
+ errs() << "New kernel:\n";
+ BB->print(errs());
+ errs() << ScheduleDump;
+ report_fatal_error(
+ "Modulo kernel validation (-pipeliner-experimental-cg) failed");
+ }
+
+ // Cleanup by removing BB from the CFG again as the original
+ // ModuloScheduleExpander intended.
+ Preheader->removeSuccessor(BB);
+ MSE.cleanup();
+}
+
+//===----------------------------------------------------------------------===//
+// ModuloScheduleTestPass implementation
+//===----------------------------------------------------------------------===//
+// This pass constructs a ModuloSchedule from the machine function and runs
+// ModuloScheduleExpander.
+//
+// The module is expected to contain a single-block analyzable loop.
+// The total order of instructions is taken from the loop as-is.
+// Instructions are expected to be annotated with a PostInstrSymbol.
+// This PostInstrSymbol must have the following format:
+// "Stage=%d Cycle=%d".
+//===----------------------------------------------------------------------===//
+
+namespace {
+class ModuloScheduleTest : public MachineFunctionPass {
+public:
+ static char ID;
+
+ ModuloScheduleTest() : MachineFunctionPass(ID) {
+ initializeModuloScheduleTestPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ void runOnLoop(MachineFunction &MF, MachineLoop &L);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<LiveIntervals>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+} // namespace
+
+char ModuloScheduleTest::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ModuloScheduleTest, "modulo-schedule-test",
+ "Modulo Schedule test pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(ModuloScheduleTest, "modulo-schedule-test",
+ "Modulo Schedule test pass", false, false)
+
+bool ModuloScheduleTest::runOnMachineFunction(MachineFunction &MF) {
+ MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+ for (auto *L : MLI) {
+ if (L->getTopBlock() != L->getBottomBlock())
+ continue;
+ runOnLoop(MF, *L);
+ return false;
+ }
+ return false;
+}
+
+static void parseSymbolString(StringRef S, int &Cycle, int &Stage) {
+ std::pair<StringRef, StringRef> StageAndCycle = getToken(S, "_");
+ std::pair<StringRef, StringRef> StageTokenAndValue =
+ getToken(StageAndCycle.first, "-");
+ std::pair<StringRef, StringRef> CycleTokenAndValue =
+ getToken(StageAndCycle.second, "-");
+ if (StageTokenAndValue.first != "Stage" ||
+ CycleTokenAndValue.first != "_Cycle") {
+ llvm_unreachable(
+ "Bad post-instr symbol syntax: see comment in ModuloScheduleTest");
+ return;
+ }
+
+ StageTokenAndValue.second.drop_front().getAsInteger(10, Stage);
+ CycleTokenAndValue.second.drop_front().getAsInteger(10, Cycle);
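+ // E.g. (values illustrative): "Stage-2_Cycle-7" yields Stage == 2 and
+ // Cycle == 7.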
+
+ dbgs() << " Stage=" << Stage << ", Cycle=" << Cycle << "\n";
+}
+
+void ModuloScheduleTest::runOnLoop(MachineFunction &MF, MachineLoop &L) {
+ LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+ MachineBasicBlock *BB = L.getTopBlock();
+ dbgs() << "--- ModuloScheduleTest running on BB#" << BB->getNumber() << "\n";
+
+ DenseMap<MachineInstr *, int> Cycle, Stage;
+ std::vector<MachineInstr *> Instrs;
+ for (MachineInstr &MI : *BB) {
+ if (MI.isTerminator())
+ continue;
+ Instrs.push_back(&MI);
+ if (MCSymbol *Sym = MI.getPostInstrSymbol()) {
+ dbgs() << "Parsing post-instr symbol for " << MI;
+ parseSymbolString(Sym->getName(), Cycle[&MI], Stage[&MI]);
+ }
+ }
+
+ ModuloSchedule MS(MF, &L, std::move(Instrs), std::move(Cycle),
+ std::move(Stage));
+ ModuloScheduleExpander MSE(
+ MF, MS, LIS, /*InstrChanges=*/ModuloScheduleExpander::InstrChangesTy());
+ MSE.expand();
+ MSE.cleanup();
+}
+
+//===----------------------------------------------------------------------===//
+// ModuloScheduleTestAnnotater implementation
+//===----------------------------------------------------------------------===//
+
+void ModuloScheduleTestAnnotater::annotate() {
+ for (MachineInstr *MI : S.getInstructions()) {
+ SmallVector<char, 16> SV;
+ raw_svector_ostream OS(SV);
+ OS << "Stage-" << S.getStage(MI) << "_Cycle-" << S.getCycle(MI);
+ MCSymbol *Sym = MF.getContext().getOrCreateSymbol(OS.str());
+ MI->setPostInstrSymbol(MF, Sym);
+ }
+}
diff --git a/lib/CodeGen/OptimizePHIs.cpp b/lib/CodeGen/OptimizePHIs.cpp
index c70b62252139..1a493964e678 100644
--- a/lib/CodeGen/OptimizePHIs.cpp
+++ b/lib/CodeGen/OptimizePHIs.cpp
@@ -97,7 +97,7 @@ bool OptimizePHIs::IsSingleValuePHICycle(MachineInstr *MI,
unsigned &SingleValReg,
InstrSet &PHIsInCycle) {
assert(MI->isPHI() && "IsSingleValuePHICycle expects a PHI instruction");
- unsigned DstReg = MI->getOperand(0).getReg();
+ Register DstReg = MI->getOperand(0).getReg();
// See if we already saw this register.
if (!PHIsInCycle.insert(MI).second)
@@ -109,16 +109,15 @@ bool OptimizePHIs::IsSingleValuePHICycle(MachineInstr *MI,
// Scan the PHI operands.
for (unsigned i = 1; i != MI->getNumOperands(); i += 2) {
- unsigned SrcReg = MI->getOperand(i).getReg();
+ Register SrcReg = MI->getOperand(i).getReg();
if (SrcReg == DstReg)
continue;
MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
// Skip over register-to-register moves.
- if (SrcMI && SrcMI->isCopy() &&
- !SrcMI->getOperand(0).getSubReg() &&
+ if (SrcMI && SrcMI->isCopy() && !SrcMI->getOperand(0).getSubReg() &&
!SrcMI->getOperand(1).getSubReg() &&
- TargetRegisterInfo::isVirtualRegister(SrcMI->getOperand(1).getReg())) {
+ Register::isVirtualRegister(SrcMI->getOperand(1).getReg())) {
SrcReg = SrcMI->getOperand(1).getReg();
SrcMI = MRI->getVRegDef(SrcReg);
}
@@ -142,8 +141,8 @@ bool OptimizePHIs::IsSingleValuePHICycle(MachineInstr *MI,
/// other PHIs in a cycle.
bool OptimizePHIs::IsDeadPHICycle(MachineInstr *MI, InstrSet &PHIsInCycle) {
assert(MI->isPHI() && "IsDeadPHICycle expects a PHI instruction");
- unsigned DstReg = MI->getOperand(0).getReg();
- assert(TargetRegisterInfo::isVirtualRegister(DstReg) &&
+ Register DstReg = MI->getOperand(0).getReg();
+ assert(Register::isVirtualRegister(DstReg) &&
"PHI destination is not a virtual register");
// See if we already saw this register.
@@ -177,7 +176,7 @@ bool OptimizePHIs::OptimizeBB(MachineBasicBlock &MBB) {
InstrSet PHIsInCycle;
if (IsSingleValuePHICycle(MI, SingleValReg, PHIsInCycle) &&
SingleValReg != 0) {
- unsigned OldReg = MI->getOperand(0).getReg();
+ Register OldReg = MI->getOperand(0).getReg();
if (!MRI->constrainRegClass(SingleValReg, MRI->getRegClass(OldReg)))
continue;
diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp
index 948a5835438c..4dd4c4b1084e 100644
--- a/lib/CodeGen/PHIElimination.cpp
+++ b/lib/CodeGen/PHIElimination.cpp
@@ -31,7 +31,9 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Pass.h"
@@ -168,7 +170,7 @@ bool PHIElimination::runOnMachineFunction(MachineFunction &MF) {
// Remove dead IMPLICIT_DEF instructions.
for (MachineInstr *DefMI : ImpDefs) {
- unsigned DefReg = DefMI->getOperand(0).getReg();
+ Register DefReg = DefMI->getOperand(0).getReg();
if (MRI->use_nodbg_empty(DefReg)) {
if (LIS)
LIS->RemoveMachineInstrFromMaps(*DefMI);
@@ -183,6 +185,11 @@ bool PHIElimination::runOnMachineFunction(MachineFunction &MF) {
MF.DeleteMachineInstr(I.first);
}
+ // TODO: we should use the incremental DomTree updater here.
+ if (Changed)
+ if (auto *MDT = getAnalysisIfAvailable<MachineDominatorTree>())
+ MDT->getBase().recalculate(MF);
+
LoweredPHIs.clear();
ImpDefs.clear();
VRegPHIUseCount.clear();
@@ -240,7 +247,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
MachineInstr *MPhi = MBB.remove(&*MBB.begin());
unsigned NumSrcs = (MPhi->getNumOperands() - 1) / 2;
- unsigned DestReg = MPhi->getOperand(0).getReg();
+ Register DestReg = MPhi->getOperand(0).getReg();
assert(MPhi->getOperand(0).getSubReg() == 0 && "Can't handle sub-reg PHIs");
bool isDead = MPhi->getOperand(0).isDead();
@@ -252,11 +259,12 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
// Insert a register to register copy at the top of the current block (but
// after any remaining phi nodes) which copies the new incoming register
// into the phi node destination.
+ MachineInstr *PHICopy = nullptr;
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
if (allPhiOperandsUndefined(*MPhi, *MRI))
// If all sources of a PHI node are implicit_def or undef uses, just emit an
// implicit_def instead of a copy.
- BuildMI(MBB, AfterPHIsIt, MPhi->getDebugLoc(),
+ PHICopy = BuildMI(MBB, AfterPHIsIt, MPhi->getDebugLoc(),
TII->get(TargetOpcode::IMPLICIT_DEF), DestReg);
else {
// Can we reuse an earlier PHI node? This only happens for critical edges,
@@ -273,15 +281,13 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(DestReg);
entry = IncomingReg = MF.getRegInfo().createVirtualRegister(RC);
}
- BuildMI(MBB, AfterPHIsIt, MPhi->getDebugLoc(),
- TII->get(TargetOpcode::COPY), DestReg)
- .addReg(IncomingReg);
+ // Give the target the possibility to handle special cases; fall through otherwise.
+ PHICopy = TII->createPHIDestinationCopy(MBB, AfterPHIsIt, MPhi->getDebugLoc(),
+ IncomingReg, DestReg);
}
// Update live variable information if there is any.
if (LV) {
- MachineInstr &PHICopy = *std::prev(AfterPHIsIt);
-
if (IncomingReg) {
LiveVariables::VarInfo &VI = LV->getVarInfo(IncomingReg);
@@ -302,7 +308,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
// killed. Note that because the value is defined in several places (once
// each for each incoming block), the "def" block and instruction fields
// for the VarInfo is not filled in.
- LV->addVirtualRegisterKilled(IncomingReg, PHICopy);
+ LV->addVirtualRegisterKilled(IncomingReg, *PHICopy);
}
// Since we are going to be deleting the PHI node, if it is the last use of
@@ -312,15 +318,14 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
// If the result is dead, update LV.
if (isDead) {
- LV->addVirtualRegisterDead(DestReg, PHICopy);
+ LV->addVirtualRegisterDead(DestReg, *PHICopy);
LV->removeVirtualRegisterDead(DestReg, *MPhi);
}
}
// Update LiveIntervals for the new copy or implicit def.
if (LIS) {
- SlotIndex DestCopyIndex =
- LIS->InsertMachineInstrInMaps(*std::prev(AfterPHIsIt));
+ SlotIndex DestCopyIndex = LIS->InsertMachineInstrInMaps(*PHICopy);
SlotIndex MBBStartIndex = LIS->getMBBStartIdx(&MBB);
if (IncomingReg) {
@@ -368,11 +373,11 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
// IncomingReg register in the corresponding predecessor basic block.
SmallPtrSet<MachineBasicBlock*, 8> MBBsInsertedInto;
for (int i = NumSrcs - 1; i >= 0; --i) {
- unsigned SrcReg = MPhi->getOperand(i*2+1).getReg();
+ Register SrcReg = MPhi->getOperand(i * 2 + 1).getReg();
unsigned SrcSubReg = MPhi->getOperand(i*2+1).getSubReg();
bool SrcUndef = MPhi->getOperand(i*2+1).isUndef() ||
isImplicitlyDefined(SrcReg, *MRI);
- assert(TargetRegisterInfo::isVirtualRegister(SrcReg) &&
+ assert(Register::isVirtualRegister(SrcReg) &&
"Machine PHI Operands must all be virtual registers!");
// Get the MachineBasicBlock equivalent of the BasicBlock that is the source
@@ -406,9 +411,9 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
if (DefMI->isImplicitDef())
ImpDefs.insert(DefMI);
} else {
- NewSrcInstr = BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(),
- TII->get(TargetOpcode::COPY), IncomingReg)
- .addReg(SrcReg, 0, SrcSubReg);
+ NewSrcInstr =
+ TII->createPHISourceCopy(opBlock, InsertPos, MPhi->getDebugLoc(),
+ SrcReg, SrcSubReg, IncomingReg);
}
}
@@ -457,7 +462,7 @@ void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
}
} else {
// We just inserted this copy.
- KillInst = std::prev(InsertPos);
+ KillInst = NewSrcInstr;
}
}
assert(KillInst->readsRegister(SrcReg) && "Cannot find kill instruction");
@@ -567,7 +572,7 @@ bool PHIElimination::SplitPHIEdges(MachineFunction &MF,
for (MachineBasicBlock::iterator BBI = MBB.begin(), BBE = MBB.end();
BBI != BBE && BBI->isPHI(); ++BBI) {
for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2) {
- unsigned Reg = BBI->getOperand(i).getReg();
+ Register Reg = BBI->getOperand(i).getReg();
MachineBasicBlock *PreMBB = BBI->getOperand(i+1).getMBB();
// Is there a critical edge from PreMBB to MBB?
if (PreMBB->succ_size() == 1)
diff --git a/lib/CodeGen/PatchableFunction.cpp b/lib/CodeGen/PatchableFunction.cpp
index a3fa1b0ad8ed..529fde84e39a 100644
--- a/lib/CodeGen/PatchableFunction.cpp
+++ b/lib/CodeGen/PatchableFunction.cpp
@@ -78,7 +78,7 @@ bool PatchableFunction::runOnMachineFunction(MachineFunction &MF) {
MIB.add(MO);
FirstActualI->eraseFromParent();
- MF.ensureAlignment(4);
+ MF.ensureAlignment(Align(16));
return true;
}
diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp
index b918396aa8c5..54f1d38ed106 100644
--- a/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/lib/CodeGen/PeepholeOptimizer.cpp
@@ -418,7 +418,7 @@ namespace {
const MachineRegisterInfo &MRI,
const TargetInstrInfo *TII = nullptr)
: DefSubReg(DefSubReg), Reg(Reg), MRI(MRI), TII(TII) {
- if (!TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (!Register::isPhysicalRegister(Reg)) {
Def = MRI.getVRegDef(Reg);
DefIdx = MRI.def_begin(Reg).getOperandNo();
}
@@ -460,8 +460,8 @@ optimizeExtInstr(MachineInstr &MI, MachineBasicBlock &MBB,
if (!TII->isCoalescableExtInstr(MI, SrcReg, DstReg, SubIdx))
return false;
- if (TargetRegisterInfo::isPhysicalRegister(DstReg) ||
- TargetRegisterInfo::isPhysicalRegister(SrcReg))
+ if (Register::isPhysicalRegister(DstReg) ||
+ Register::isPhysicalRegister(SrcReg))
return false;
if (MRI->hasOneNonDBGUse(SrcReg))
@@ -581,7 +581,7 @@ optimizeExtInstr(MachineInstr &MI, MachineBasicBlock &MBB,
MRI->constrainRegClass(DstReg, DstRC);
}
- unsigned NewVR = MRI->createVirtualRegister(RC);
+ Register NewVR = MRI->createVirtualRegister(RC);
MachineInstr *Copy = BuildMI(*UseMBB, UseMI, UseMI->getDebugLoc(),
TII->get(TargetOpcode::COPY), NewVR)
.addReg(DstReg, 0, SubIdx);
@@ -609,8 +609,8 @@ bool PeepholeOptimizer::optimizeCmpInstr(MachineInstr &MI) {
unsigned SrcReg, SrcReg2;
int CmpMask, CmpValue;
if (!TII->analyzeCompare(MI, SrcReg, SrcReg2, CmpMask, CmpValue) ||
- TargetRegisterInfo::isPhysicalRegister(SrcReg) ||
- (SrcReg2 != 0 && TargetRegisterInfo::isPhysicalRegister(SrcReg2)))
+ Register::isPhysicalRegister(SrcReg) ||
+ (SrcReg2 != 0 && Register::isPhysicalRegister(SrcReg2)))
return false;
// Attempt to optimize the comparison instruction.
@@ -663,7 +663,7 @@ bool PeepholeOptimizer::findNextSource(RegSubRegPair RegSubReg,
// Thus, instead of maintaining untested code, we will revisit that if
// that changes at some point.
unsigned Reg = RegSubReg.Reg;
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
return false;
const TargetRegisterClass *DefRC = MRI->getRegClass(Reg);
@@ -675,7 +675,7 @@ bool PeepholeOptimizer::findNextSource(RegSubRegPair RegSubReg,
do {
CurSrcPair = SrcToLook.pop_back_val();
// As explained above, do not handle physical registers
- if (TargetRegisterInfo::isPhysicalRegister(CurSrcPair.Reg))
+ if (Register::isPhysicalRegister(CurSrcPair.Reg))
return false;
ValueTracker ValTracker(CurSrcPair.Reg, CurSrcPair.SubReg, *MRI, TII);
@@ -723,7 +723,7 @@ bool PeepholeOptimizer::findNextSource(RegSubRegPair RegSubReg,
// constraints to the register allocator. Moreover, if we want to extend
// the live-range of a physical register, unlike SSA virtual register,
// we will have to check that they aren't redefine before the related use.
- if (TargetRegisterInfo::isPhysicalRegister(CurSrcPair.Reg))
+ if (Register::isPhysicalRegister(CurSrcPair.Reg))
return false;
// Keep following the chain if the value isn't any better yet.
@@ -761,7 +761,7 @@ insertPHI(MachineRegisterInfo &MRI, const TargetInstrInfo &TII,
// NewRC is only correct if no subregisters are involved. findNextSource()
// should have rejected those cases already.
assert(SrcRegs[0].SubReg == 0 && "should not have subreg operand");
- unsigned NewVR = MRI.createVirtualRegister(NewRC);
+ Register NewVR = MRI.createVirtualRegister(NewRC);
MachineBasicBlock *MBB = OrigPHI.getParent();
MachineInstrBuilder MIB = BuildMI(*MBB, &OrigPHI, OrigPHI.getDebugLoc(),
TII.get(TargetOpcode::PHI), NewVR);
@@ -1170,7 +1170,7 @@ bool PeepholeOptimizer::optimizeCoalescableCopy(MachineInstr &MI) {
"Coalescer can understand multiple defs?!");
const MachineOperand &MODef = MI.getOperand(0);
// Do not rewrite physical definitions.
- if (TargetRegisterInfo::isPhysicalRegister(MODef.getReg()))
+ if (Register::isPhysicalRegister(MODef.getReg()))
return false;
bool Changed = false;
@@ -1221,7 +1221,7 @@ bool PeepholeOptimizer::optimizeCoalescableCopy(MachineInstr &MI) {
MachineInstr &
PeepholeOptimizer::rewriteSource(MachineInstr &CopyLike,
RegSubRegPair Def, RewriteMapTy &RewriteMap) {
- assert(!TargetRegisterInfo::isPhysicalRegister(Def.Reg) &&
+ assert(!Register::isPhysicalRegister(Def.Reg) &&
"We do not rewrite physical registers");
// Find the new source to use in the COPY rewrite.
@@ -1229,7 +1229,7 @@ PeepholeOptimizer::rewriteSource(MachineInstr &CopyLike,
// Insert the COPY.
const TargetRegisterClass *DefRC = MRI->getRegClass(Def.Reg);
- unsigned NewVReg = MRI->createVirtualRegister(DefRC);
+ Register NewVReg = MRI->createVirtualRegister(DefRC);
MachineInstr *NewCopy =
BuildMI(*CopyLike.getParent(), &CopyLike, CopyLike.getDebugLoc(),
@@ -1280,7 +1280,7 @@ bool PeepholeOptimizer::optimizeUncoalescableCopy(
while (CpyRewriter.getNextRewritableSource(Src, Def)) {
// If a physical register is here, this is probably for a good reason.
// Do not rewrite that.
- if (TargetRegisterInfo::isPhysicalRegister(Def.Reg))
+ if (Register::isPhysicalRegister(Def.Reg))
return false;
// If we do not know how to rewrite this definition, there is no point
@@ -1315,12 +1315,11 @@ bool PeepholeOptimizer::isLoadFoldable(
if (MCID.getNumDefs() != 1)
return false;
- unsigned Reg = MI.getOperand(0).getReg();
+ Register Reg = MI.getOperand(0).getReg();
// To reduce compilation time, we check MRI->hasOneNonDBGUser when inserting
// loads. It should be checked when processing uses of the load, since
// uses can be removed during peephole.
- if (!MI.getOperand(0).getSubReg() &&
- TargetRegisterInfo::isVirtualRegister(Reg) &&
+ if (!MI.getOperand(0).getSubReg() && Register::isVirtualRegister(Reg) &&
MRI->hasOneNonDBGUser(Reg)) {
FoldAsLoadDefCandidates.insert(Reg);
return true;
@@ -1336,8 +1335,8 @@ bool PeepholeOptimizer::isMoveImmediate(
return false;
if (MCID.getNumDefs() != 1)
return false;
- unsigned Reg = MI.getOperand(0).getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ Register Reg = MI.getOperand(0).getReg();
+ if (Register::isVirtualRegister(Reg)) {
ImmDefMIs.insert(std::make_pair(Reg, &MI));
ImmDefRegs.insert(Reg);
return true;
@@ -1359,8 +1358,8 @@ bool PeepholeOptimizer::foldImmediate(MachineInstr &MI,
// Ignore dead implicit defs.
if (MO.isImplicit() && MO.isDead())
continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ Register Reg = MO.getReg();
+ if (!Register::isVirtualRegister(Reg))
continue;
if (ImmDefRegs.count(Reg) == 0)
continue;
@@ -1393,12 +1392,12 @@ bool PeepholeOptimizer::foldRedundantCopy(MachineInstr &MI,
DenseMap<unsigned, MachineInstr *> &CopyMIs) {
assert(MI.isCopy() && "expected a COPY machine instruction");
- unsigned SrcReg = MI.getOperand(1).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ Register SrcReg = MI.getOperand(1).getReg();
+ if (!Register::isVirtualRegister(SrcReg))
return false;
- unsigned DstReg = MI.getOperand(0).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(DstReg))
+ Register DstReg = MI.getOperand(0).getReg();
+ if (!Register::isVirtualRegister(DstReg))
return false;
if (CopySrcRegs.insert(SrcReg).second) {
@@ -1416,7 +1415,7 @@ bool PeepholeOptimizer::foldRedundantCopy(MachineInstr &MI,
if (SrcSubReg != PrevSrcSubReg)
return false;
- unsigned PrevDstReg = PrevCopy->getOperand(0).getReg();
+ Register PrevDstReg = PrevCopy->getOperand(0).getReg();
// Only replace if the copy register class is the same.
//
@@ -1433,8 +1432,7 @@ bool PeepholeOptimizer::foldRedundantCopy(MachineInstr &MI,
}
bool PeepholeOptimizer::isNAPhysCopy(unsigned Reg) {
- return TargetRegisterInfo::isPhysicalRegister(Reg) &&
- !MRI->isAllocatable(Reg);
+ return Register::isPhysicalRegister(Reg) && !MRI->isAllocatable(Reg);
}
bool PeepholeOptimizer::foldRedundantNAPhysCopy(
@@ -1444,9 +1442,9 @@ bool PeepholeOptimizer::foldRedundantNAPhysCopy(
if (DisableNAPhysCopyOpt)
return false;
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(1).getReg();
- if (isNAPhysCopy(SrcReg) && TargetRegisterInfo::isVirtualRegister(DstReg)) {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ if (isNAPhysCopy(SrcReg) && Register::isVirtualRegister(DstReg)) {
// %vreg = COPY %physreg
// Avoid using a datastructure which can track multiple live non-allocatable
// phys->virt copies since LLVM doesn't seem to do this.
@@ -1454,7 +1452,7 @@ bool PeepholeOptimizer::foldRedundantNAPhysCopy(
return false;
}
- if (!(TargetRegisterInfo::isVirtualRegister(SrcReg) && isNAPhysCopy(DstReg)))
+ if (!(Register::isVirtualRegister(SrcReg) && isNAPhysCopy(DstReg)))
return false;
// %physreg = COPY %vreg
@@ -1467,7 +1465,7 @@ bool PeepholeOptimizer::foldRedundantNAPhysCopy(
return false;
}
- unsigned PrevDstReg = PrevCopy->second->getOperand(0).getReg();
+ Register PrevDstReg = PrevCopy->second->getOperand(0).getReg();
if (PrevDstReg == SrcReg) {
// Remove the virt->phys copy: we saw the virtual register definition, and
// the non-allocatable physical register's state hasn't changed since then.
@@ -1489,7 +1487,7 @@ bool PeepholeOptimizer::foldRedundantNAPhysCopy(
static bool isVirtualRegisterOperand(MachineOperand &MO) {
if (!MO.isReg())
return false;
- return TargetRegisterInfo::isVirtualRegister(MO.getReg());
+ return Register::isVirtualRegister(MO.getReg());
}
bool PeepholeOptimizer::findTargetRecurrence(
@@ -1662,7 +1660,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
for (const MachineOperand &MO : MI->operands()) {
// Visit all operands: definitions can be implicit or explicit.
if (MO.isReg()) {
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (MO.isDef() && isNAPhysCopy(Reg)) {
const auto &Def = NAPhysToVirtMIs.find(Reg);
if (Def != NAPhysToVirtMIs.end()) {
@@ -1778,7 +1776,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
LocalMIs.erase(DefMI);
LocalMIs.insert(FoldMI);
if (MI->isCall())
- MI->getMF()->updateCallSiteInfo(MI, FoldMI);
+ MI->getMF()->moveCallSiteInfo(MI, FoldMI);
MI->eraseFromParent();
DefMI->eraseFromParent();
MRI->markUsesInDebugValueAsUndef(FoldedReg);
@@ -1810,7 +1808,11 @@ ValueTrackerResult ValueTracker::getNextSourceFromCopy() {
assert(Def->isCopy() && "Invalid definition");
// Copy instruction are supposed to be: Def = Src.
// If someone breaks this assumption, bad things will happen everywhere.
- assert(Def->getNumOperands() == 2 && "Invalid number of operands");
+ // There may be implicit uses preventing the copy from being moved across
+ // some target-specific register definitions.
+ assert(Def->getNumOperands() - Def->getNumImplicitOperands() == 2 &&
+ "Invalid number of operands");
+ assert(!Def->hasImplicitDef() && "Only implicit uses are allowed");
if (Def->getOperand(DefIdx).getSubReg() != DefSubReg)
// If we look for a different subreg, it means we want a subreg of src.
@@ -1855,6 +1857,11 @@ ValueTrackerResult ValueTracker::getNextSourceFromBitcast() {
SrcIdx = OpIdx;
}
+ // In some rare cases, Def has no input, so SrcIdx is out of bounds and
+ // getOperand(SrcIdx) below would fail.
+ if (SrcIdx >= Def->getNumOperands())
+ return ValueTrackerResult();
+
// Stop when any user of the bitcast is a SUBREG_TO_REG, replacing with a COPY
// will break the assumed guarantees for the upper bits.
for (const MachineInstr &UseMI : MRI.use_nodbg_instructions(DefOp.getReg())) {
@@ -2087,7 +2094,7 @@ ValueTrackerResult ValueTracker::getNextSource() {
// If we can still move up in the use-def chain, move to the next
// definition.
- if (!TargetRegisterInfo::isPhysicalRegister(Reg) && OneRegSrc) {
+ if (!Register::isPhysicalRegister(Reg) && OneRegSrc) {
MachineRegisterInfo::def_iterator DI = MRI.def_begin(Reg);
if (DI != MRI.def_end()) {
Def = DI->getParent();
diff --git a/lib/CodeGen/PreISelIntrinsicLowering.cpp b/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 2752e186875c..0d2f6f99ca96 100644
--- a/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -76,7 +76,7 @@ static bool lowerObjCCall(Function &F, const char *NewFn,
}
for (auto I = F.use_begin(), E = F.use_end(); I != E;) {
- auto *CI = dyn_cast<CallInst>(I->getUser());
+ auto *CI = cast<CallInst>(I->getUser());
assert(CI->getCalledFunction() && "Cannot lower an indirect call!");
++I;
diff --git a/lib/CodeGen/ProcessImplicitDefs.cpp b/lib/CodeGen/ProcessImplicitDefs.cpp
index b38987ad1c90..11bff45f9ad5 100644
--- a/lib/CodeGen/ProcessImplicitDefs.cpp
+++ b/lib/CodeGen/ProcessImplicitDefs.cpp
@@ -73,9 +73,9 @@ bool ProcessImplicitDefs::canTurnIntoImplicitDef(MachineInstr *MI) {
void ProcessImplicitDefs::processImplicitDef(MachineInstr *MI) {
LLVM_DEBUG(dbgs() << "Processing " << *MI);
- unsigned Reg = MI->getOperand(0).getReg();
+ Register Reg = MI->getOperand(0).getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
// For virtual registers, mark all uses as <undef>, and convert users to
// implicit-def when possible.
for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) {
@@ -100,8 +100,8 @@ void ProcessImplicitDefs::processImplicitDef(MachineInstr *MI) {
for (MachineOperand &MO : UserMI->operands()) {
if (!MO.isReg())
continue;
- unsigned UserReg = MO.getReg();
- if (!TargetRegisterInfo::isPhysicalRegister(UserReg) ||
+ Register UserReg = MO.getReg();
+ if (!Register::isPhysicalRegister(UserReg) ||
!TRI->regsOverlap(Reg, UserReg))
continue;
// UserMI uses or redefines Reg. Set <undef> flags on all uses.
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index d463bee67595..729f06dda62b 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -898,7 +898,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) {
// frame index registers. Functions which don't want/need this optimization
// will continue to use the existing code path.
if (MFI.getUseLocalStackAllocationBlock()) {
- unsigned Align = MFI.getLocalFrameMaxAlign();
+ unsigned Align = MFI.getLocalFrameMaxAlign().value();
// Adjust to alignment boundary.
Offset = alignTo(Offset, Align, Skew);
diff --git a/lib/CodeGen/PseudoSourceValue.cpp b/lib/CodeGen/PseudoSourceValue.cpp
index da3ef4b771f3..74e721dbd138 100644
--- a/lib/CodeGen/PseudoSourceValue.cpp
+++ b/lib/CodeGen/PseudoSourceValue.cpp
@@ -129,7 +129,7 @@ const PseudoSourceValue *
PseudoSourceValueManager::getFixedStack(int FI) {
std::unique_ptr<FixedStackPseudoSourceValue> &V = FSValues[FI];
if (!V)
- V = llvm::make_unique<FixedStackPseudoSourceValue>(FI, TII);
+ V = std::make_unique<FixedStackPseudoSourceValue>(FI, TII);
return V.get();
}
@@ -138,7 +138,7 @@ PseudoSourceValueManager::getGlobalValueCallEntry(const GlobalValue *GV) {
std::unique_ptr<const GlobalValuePseudoSourceValue> &E =
GlobalCallEntries[GV];
if (!E)
- E = llvm::make_unique<GlobalValuePseudoSourceValue>(GV, TII);
+ E = std::make_unique<GlobalValuePseudoSourceValue>(GV, TII);
return E.get();
}
@@ -147,6 +147,6 @@ PseudoSourceValueManager::getExternalSymbolCallEntry(const char *ES) {
std::unique_ptr<const ExternalSymbolPseudoSourceValue> &E =
ExternalCallEntries[ES];
if (!E)
- E = llvm::make_unique<ExternalSymbolPseudoSourceValue>(ES, TII);
+ E = std::make_unique<ExternalSymbolPseudoSourceValue>(ES, TII);
return E.get();
}
diff --git a/lib/CodeGen/ReachingDefAnalysis.cpp b/lib/CodeGen/ReachingDefAnalysis.cpp
index f05c97ad621e..2850033e6419 100644
--- a/lib/CodeGen/ReachingDefAnalysis.cpp
+++ b/lib/CodeGen/ReachingDefAnalysis.cpp
@@ -9,6 +9,7 @@
#include "llvm/CodeGen/ReachingDefAnalysis.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/Support/Debug.h"
using namespace llvm;
diff --git a/lib/CodeGen/RegAllocBase.cpp b/lib/CodeGen/RegAllocBase.cpp
index 1cbe75c27d13..156daaa03bb5 100644
--- a/lib/CodeGen/RegAllocBase.cpp
+++ b/lib/CodeGen/RegAllocBase.cpp
@@ -73,7 +73,7 @@ void RegAllocBase::seedLiveRegs() {
NamedRegionTimer T("seed", "Seed Live Regs", TimerGroupName,
TimerGroupDescription, TimePassesIsEnabled);
for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ unsigned Reg = Register::index2VirtReg(i);
if (MRI->reg_nodbg_empty(Reg))
continue;
enqueue(&LIS->getInterval(Reg));
@@ -154,7 +154,7 @@ void RegAllocBase::allocatePhysRegs() {
continue;
}
LLVM_DEBUG(dbgs() << "queuing new interval: " << *SplitVirtReg << "\n");
- assert(TargetRegisterInfo::isVirtualRegister(SplitVirtReg->reg) &&
+ assert(Register::isVirtualRegister(SplitVirtReg->reg) &&
"expect split value in virtual register");
enqueue(SplitVirtReg);
++NumNewQueued;
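
The index2VirtReg/virtReg2Index helpers moved to Register as well. A sketch of the round trip, assuming this era's encoding in which virtual register numbers are dense indices with a tag bit set:

    #include "llvm/CodeGen/Register.h"
    #include <cassert>

    unsigned denseIndex(llvm::Register VReg) {
      assert(llvm::Register::isVirtualRegister(VReg) &&
             "only virtual registers have dense indices");
      unsigned Idx = llvm::Register::virtReg2Index(VReg);  // strip the tag bit
      assert(llvm::Register::index2VirtReg(Idx) == VReg && "lossless round trip");
      return Idx;  // usable as a SparseSet or BitVector key, as in RegAllocFast
    }
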
diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index 2ffa5e389f89..44d0233604e7 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -90,7 +90,7 @@ namespace {
explicit LiveReg(unsigned VirtReg) : VirtReg(VirtReg) {}
unsigned getSparseSetIndex() const {
- return TargetRegisterInfo::virtReg2Index(VirtReg);
+ return Register::virtReg2Index(VirtReg);
}
};
@@ -200,11 +200,11 @@ namespace {
void assignVirtToPhysReg(LiveReg &, MCPhysReg PhysReg);
LiveRegMap::iterator findLiveVirtReg(unsigned VirtReg) {
- return LiveVirtRegs.find(TargetRegisterInfo::virtReg2Index(VirtReg));
+ return LiveVirtRegs.find(Register::virtReg2Index(VirtReg));
}
LiveRegMap::const_iterator findLiveVirtReg(unsigned VirtReg) const {
- return LiveVirtRegs.find(TargetRegisterInfo::virtReg2Index(VirtReg));
+ return LiveVirtRegs.find(Register::virtReg2Index(VirtReg));
}
void allocVirtReg(MachineInstr &MI, LiveReg &LR, unsigned Hint);
@@ -264,7 +264,7 @@ int RegAllocFast::getStackSpaceFor(unsigned VirtReg) {
/// Returns false if \p VirtReg is known to not live out of the current block.
bool RegAllocFast::mayLiveOut(unsigned VirtReg) {
- if (MayLiveAcrossBlocks.test(TargetRegisterInfo::virtReg2Index(VirtReg))) {
+ if (MayLiveAcrossBlocks.test(Register::virtReg2Index(VirtReg))) {
// Cannot be live-out if there are no successors.
return !MBB->succ_empty();
}
@@ -272,7 +272,7 @@ bool RegAllocFast::mayLiveOut(unsigned VirtReg) {
// If this block loops back to itself, it would be necessary to check whether
// the use comes after the def.
if (MBB->isSuccessor(MBB)) {
- MayLiveAcrossBlocks.set(TargetRegisterInfo::virtReg2Index(VirtReg));
+ MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
return true;
}
@@ -282,7 +282,7 @@ bool RegAllocFast::mayLiveOut(unsigned VirtReg) {
unsigned C = 0;
for (const MachineInstr &UseInst : MRI->reg_nodbg_instructions(VirtReg)) {
if (UseInst.getParent() != MBB || ++C >= Limit) {
- MayLiveAcrossBlocks.set(TargetRegisterInfo::virtReg2Index(VirtReg));
+ MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
// Cannot be live-out if there are no successors.
return !MBB->succ_empty();
}
@@ -293,7 +293,7 @@ bool RegAllocFast::mayLiveOut(unsigned VirtReg) {
/// Returns false if \p VirtReg is known to not be live into the current block.
bool RegAllocFast::mayLiveIn(unsigned VirtReg) {
- if (MayLiveAcrossBlocks.test(TargetRegisterInfo::virtReg2Index(VirtReg)))
+ if (MayLiveAcrossBlocks.test(Register::virtReg2Index(VirtReg)))
return !MBB->pred_empty();
// See if the first \p Limit defs of the register are all in the current block.
@@ -301,7 +301,7 @@ bool RegAllocFast::mayLiveIn(unsigned VirtReg) {
unsigned C = 0;
for (const MachineInstr &DefInst : MRI->def_instructions(VirtReg)) {
if (DefInst.getParent() != MBB || ++C >= Limit) {
- MayLiveAcrossBlocks.set(TargetRegisterInfo::virtReg2Index(VirtReg));
+ MayLiveAcrossBlocks.set(Register::virtReg2Index(VirtReg));
return !MBB->pred_empty();
}
}
@@ -394,7 +394,7 @@ void RegAllocFast::killVirtReg(LiveReg &LR) {
/// Mark virtreg as no longer available.
void RegAllocFast::killVirtReg(unsigned VirtReg) {
- assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
+ assert(Register::isVirtualRegister(VirtReg) &&
"killVirtReg needs a virtual register");
LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg);
if (LRI != LiveVirtRegs.end() && LRI->PhysReg)
@@ -405,7 +405,7 @@ void RegAllocFast::killVirtReg(unsigned VirtReg) {
/// stack slot if needed.
void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI,
unsigned VirtReg) {
- assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
+ assert(Register::isVirtualRegister(VirtReg) &&
"Spilling a physical register is illegal!");
LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg);
assert(LRI != LiveVirtRegs.end() && LRI->PhysReg &&
@@ -455,9 +455,8 @@ void RegAllocFast::usePhysReg(MachineOperand &MO) {
if (MO.isUndef())
return;
- unsigned PhysReg = MO.getReg();
- assert(TargetRegisterInfo::isPhysicalRegister(PhysReg) &&
- "Bad usePhysReg operand");
+ Register PhysReg = MO.getReg();
+ assert(Register::isPhysicalRegister(PhysReg) && "Bad usePhysReg operand");
markRegUsedInInstr(PhysReg);
switch (PhysRegState[PhysReg]) {
@@ -626,9 +625,9 @@ unsigned RegAllocFast::traceCopyChain(unsigned Reg) const {
static const unsigned ChainLengthLimit = 3;
unsigned C = 0;
do {
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
return Reg;
- assert(TargetRegisterInfo::isVirtualRegister(Reg));
+ assert(Register::isVirtualRegister(Reg));
MachineInstr *VRegDef = MRI->getUniqueVRegDef(Reg);
if (!VRegDef || !isCoalescable(*VRegDef))
@@ -646,7 +645,7 @@ unsigned RegAllocFast::traceCopies(unsigned VirtReg) const {
unsigned C = 0;
for (const MachineInstr &MI : MRI->def_instructions(VirtReg)) {
if (isCoalescable(MI)) {
- unsigned Reg = MI.getOperand(1).getReg();
+ Register Reg = MI.getOperand(1).getReg();
Reg = traceCopyChain(Reg);
if (Reg != 0)
return Reg;
@@ -662,7 +661,7 @@ unsigned RegAllocFast::traceCopies(unsigned VirtReg) const {
void RegAllocFast::allocVirtReg(MachineInstr &MI, LiveReg &LR, unsigned Hint0) {
const unsigned VirtReg = LR.VirtReg;
- assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
+ assert(Register::isVirtualRegister(VirtReg) &&
"Can only allocate virtual registers");
const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
@@ -671,8 +670,8 @@ void RegAllocFast::allocVirtReg(MachineInstr &MI, LiveReg &LR, unsigned Hint0) {
<< " with hint " << printReg(Hint0, TRI) << '\n');
// Take hint when possible.
- if (TargetRegisterInfo::isPhysicalRegister(Hint0) &&
- MRI->isAllocatable(Hint0) && RC.contains(Hint0)) {
+ if (Register::isPhysicalRegister(Hint0) && MRI->isAllocatable(Hint0) &&
+ RC.contains(Hint0)) {
// Ignore the hint if we would have to spill a dirty register.
unsigned Cost = calcSpillCost(Hint0);
if (Cost < spillDirty) {
@@ -692,9 +691,8 @@ void RegAllocFast::allocVirtReg(MachineInstr &MI, LiveReg &LR, unsigned Hint0) {
// Try other hint.
unsigned Hint1 = traceCopies(VirtReg);
- if (TargetRegisterInfo::isPhysicalRegister(Hint1) &&
- MRI->isAllocatable(Hint1) && RC.contains(Hint1) &&
- !isRegUsedInInstr(Hint1)) {
+ if (Register::isPhysicalRegister(Hint1) && MRI->isAllocatable(Hint1) &&
+ RC.contains(Hint1) && !isRegUsedInInstr(Hint1)) {
// Ignore the hint if we would have to spill a dirty register.
unsigned Cost = calcSpillCost(Hint1);
if (Cost < spillDirty) {
@@ -752,8 +750,8 @@ void RegAllocFast::allocVirtReg(MachineInstr &MI, LiveReg &LR, unsigned Hint0) {
void RegAllocFast::allocVirtRegUndef(MachineOperand &MO) {
assert(MO.isUndef() && "expected undef use");
- unsigned VirtReg = MO.getReg();
- assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && "Expected virtreg");
+ Register VirtReg = MO.getReg();
+ assert(Register::isVirtualRegister(VirtReg) && "Expected virtreg");
LiveRegMap::const_iterator LRI = findLiveVirtReg(VirtReg);
MCPhysReg PhysReg;
@@ -778,14 +776,13 @@ void RegAllocFast::allocVirtRegUndef(MachineOperand &MO) {
/// Allocates a register for VirtReg and mark it as dirty.
MCPhysReg RegAllocFast::defineVirtReg(MachineInstr &MI, unsigned OpNum,
unsigned VirtReg, unsigned Hint) {
- assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
- "Not a virtual register");
+ assert(Register::isVirtualRegister(VirtReg) && "Not a virtual register");
LiveRegMap::iterator LRI;
bool New;
std::tie(LRI, New) = LiveVirtRegs.insert(LiveReg(VirtReg));
if (!LRI->PhysReg) {
// If there is no hint, peek at the only use of this register.
- if ((!Hint || !TargetRegisterInfo::isPhysicalRegister(Hint)) &&
+ if ((!Hint || !Register::isPhysicalRegister(Hint)) &&
MRI->hasOneNonDBGUse(VirtReg)) {
const MachineInstr &UseMI = *MRI->use_instr_nodbg_begin(VirtReg);
// It's a copy, use the destination register as a hint.
@@ -812,8 +809,7 @@ RegAllocFast::LiveReg &RegAllocFast::reloadVirtReg(MachineInstr &MI,
unsigned OpNum,
unsigned VirtReg,
unsigned Hint) {
- assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
- "Not a virtual register");
+ assert(Register::isVirtualRegister(VirtReg) && "Not a virtual register");
LiveRegMap::iterator LRI;
bool New;
std::tie(LRI, New) = LiveVirtRegs.insert(LiveReg(VirtReg));
@@ -866,7 +862,7 @@ bool RegAllocFast::setPhysReg(MachineInstr &MI, MachineOperand &MO,
}
// Handle subregister index.
- MO.setReg(PhysReg ? TRI->getSubReg(PhysReg, MO.getSubReg()) : 0);
+ MO.setReg(PhysReg ? TRI->getSubReg(PhysReg, MO.getSubReg()) : Register());
MO.setIsRenamable(true);
MO.setSubReg(0);
@@ -893,8 +889,8 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI,
SmallSet<unsigned, 8> ThroughRegs;
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg()) continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ Register Reg = MO.getReg();
+ if (!Register::isVirtualRegister(Reg))
continue;
if (MO.isEarlyClobber() || (MO.isUse() && MO.isTied()) ||
(MO.getSubReg() && MI.readsVirtualRegister(Reg))) {
@@ -908,8 +904,9 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI,
LLVM_DEBUG(dbgs() << "\nChecking for physdef collisions.\n");
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg() || !MO.isDef()) continue;
- unsigned Reg = MO.getReg();
- if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+ Register Reg = MO.getReg();
+ if (!Reg || !Register::isPhysicalRegister(Reg))
+ continue;
markRegUsedInInstr(Reg);
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
if (ThroughRegs.count(PhysRegState[*AI]))
@@ -922,8 +919,9 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI,
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
MachineOperand &MO = MI.getOperand(I);
if (!MO.isReg()) continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue;
+ Register Reg = MO.getReg();
+ if (!Register::isVirtualRegister(Reg))
+ continue;
if (MO.isUse()) {
if (!MO.isTied()) continue;
LLVM_DEBUG(dbgs() << "Operand " << I << "(" << MO
@@ -947,8 +945,9 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI,
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
const MachineOperand &MO = MI.getOperand(I);
if (!MO.isReg()) continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue;
+ Register Reg = MO.getReg();
+ if (!Register::isVirtualRegister(Reg))
+ continue;
if (!MO.isEarlyClobber())
continue;
// Note: defineVirtReg may invalidate MO.
@@ -961,8 +960,9 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI,
UsedInInstr.clear();
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg() || (MO.isDef() && !MO.isEarlyClobber())) continue;
- unsigned Reg = MO.getReg();
- if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+ Register Reg = MO.getReg();
+ if (!Reg || !Register::isPhysicalRegister(Reg))
+ continue;
LLVM_DEBUG(dbgs() << "\tSetting " << printReg(Reg, TRI)
<< " as used in instr\n");
markRegUsedInInstr(Reg);
@@ -1002,10 +1002,8 @@ void RegAllocFast::dumpState() {
e = LiveVirtRegs.end(); i != e; ++i) {
if (!i->PhysReg)
continue;
- assert(TargetRegisterInfo::isVirtualRegister(i->VirtReg) &&
- "Bad map key");
- assert(TargetRegisterInfo::isPhysicalRegister(i->PhysReg) &&
- "Bad map value");
+ assert(Register::isVirtualRegister(i->VirtReg) && "Bad map key");
+ assert(Register::isPhysicalRegister(i->PhysReg) && "Bad map value");
assert(PhysRegState[i->PhysReg] == i->VirtReg && "Bad inverse map");
}
}
@@ -1045,9 +1043,9 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) {
continue;
}
if (!MO.isReg()) continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg) continue;
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
VirtOpEnd = i+1;
if (MO.isUse()) {
hasTiedOps = hasTiedOps ||
@@ -1096,8 +1094,9 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) {
for (unsigned I = 0; I != VirtOpEnd; ++I) {
MachineOperand &MO = MI.getOperand(I);
if (!MO.isReg()) continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue;
+ Register Reg = MO.getReg();
+ if (!Register::isVirtualRegister(Reg))
+ continue;
if (MO.isUse()) {
if (MO.isUndef()) {
HasUndefUse = true;
@@ -1124,8 +1123,8 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) {
for (MachineOperand &MO : MI.uses()) {
if (!MO.isReg() || !MO.isUse())
continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ Register Reg = MO.getReg();
+ if (!Register::isVirtualRegister(Reg))
continue;
assert(MO.isUndef() && "Should only have undef virtreg uses left");
@@ -1139,8 +1138,9 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) {
if (hasEarlyClobbers) {
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg()) continue;
- unsigned Reg = MO.getReg();
- if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+ Register Reg = MO.getReg();
+ if (!Reg || !Register::isPhysicalRegister(Reg))
+ continue;
// Look for physreg defs and tied uses.
if (!MO.isDef() && !MO.isTied()) continue;
markRegUsedInInstr(Reg);
@@ -1166,10 +1166,9 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) {
const MachineOperand &MO = MI.getOperand(I);
if (!MO.isReg() || !MO.isDef() || !MO.getReg() || MO.isEarlyClobber())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
- if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg) ||
- !MRI->isAllocatable(Reg))
+ if (!Reg || !Register::isPhysicalRegister(Reg) || !MRI->isAllocatable(Reg))
continue;
definePhysReg(MI, Reg, MO.isDead() ? regFree : regReserved);
}
@@ -1180,10 +1179,10 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) {
const MachineOperand &MO = MI.getOperand(I);
if (!MO.isReg() || !MO.isDef() || !MO.getReg() || MO.isEarlyClobber())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
// We have already dealt with phys regs in the previous scan.
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
continue;
MCPhysReg PhysReg = defineVirtReg(MI, I, Reg, CopySrcReg);
if (setPhysReg(MI, MI.getOperand(I), PhysReg)) {
@@ -1215,8 +1214,8 @@ void RegAllocFast::handleDebugValue(MachineInstr &MI) {
// mostly constants and frame indices.
if (!MO.isReg())
return;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ Register Reg = MO.getReg();
+ if (!Register::isVirtualRegister(Reg))
return;
// See if this virtual register has already been allocated to a physical
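
mayLiveOut/mayLiveIn above embody a small bounded-scan heuristic worth spelling out: inspect at most a fixed number of users, and if any lies outside the current block (or the limit is hit before you can tell), conservatively treat the vreg as possibly live across blocks. A generic sketch under illustrative names (Use, mayEscapeBlock), not the pass's real interface:

    #include <vector>

    struct Use { const void *ParentBlock; };

    // Look at no more than Limit users. If any is in another block, or the
    // limit is exhausted first, conservatively report "escapes".
    bool mayEscapeBlock(const std::vector<Use> &Users, const void *CurBB,
                        unsigned Limit = 8) {
      unsigned C = 0;
      for (const Use &U : Users)
        if (U.ParentBlock != CurBB || ++C >= Limit)
          return true;   // unknown => assume live across blocks
      return false;      // all users proven local to CurBB
    }
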
diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index 771fc46415db..d27db678f02a 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@@ -685,7 +685,7 @@ void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) {
// The queue holds (size, reg) pairs.
const unsigned Size = LI->getSize();
const unsigned Reg = LI->reg;
- assert(TargetRegisterInfo::isVirtualRegister(Reg) &&
+ assert(Register::isVirtualRegister(Reg) &&
"Can only enqueue virtual registers");
unsigned Prio;
@@ -899,7 +899,7 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg,
// Check if any interfering live range is heavier than MaxWeight.
for (unsigned i = Q.interferingVRegs().size(); i; --i) {
LiveInterval *Intf = Q.interferingVRegs()[i - 1];
- assert(TargetRegisterInfo::isVirtualRegister(Intf->reg) &&
+ assert(Register::isVirtualRegister(Intf->reg) &&
"Only expecting virtual register interference from query");
// Do not allow eviction of a virtual register if we are in the middle
@@ -984,7 +984,7 @@ bool RAGreedy::canEvictInterferenceInRange(LiveInterval &VirtReg,
continue;
// Cannot evict non-virtual reg interference.
- if (!TargetRegisterInfo::isVirtualRegister(Intf->reg))
+ if (!Register::isVirtualRegister(Intf->reg))
return false;
// Never evict spill products. They cannot split or spill.
if (getStage(*Intf) == RS_Done)
@@ -2881,7 +2881,7 @@ void RAGreedy::collectHintInfo(unsigned Reg, HintsInfo &Out) {
continue;
}
// Get the current assignment.
- Register OtherPhysReg = TargetRegisterInfo::isPhysicalRegister(OtherReg)
+ Register OtherPhysReg = Register::isPhysicalRegister(OtherReg)
? OtherReg
: VRM->getPhys(OtherReg);
// Push the collected information.
@@ -2919,7 +2919,7 @@ void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) {
SmallVector<unsigned, 2> RecoloringCandidates;
HintsInfo Info;
unsigned Reg = VirtReg.reg;
- unsigned PhysReg = VRM->getPhys(Reg);
+ Register PhysReg = VRM->getPhys(Reg);
// Start the recoloring algorithm from the input live-interval, then
// it will propagate to the ones that are copy-related with it.
Visited.insert(Reg);
@@ -2932,7 +2932,7 @@ void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) {
Reg = RecoloringCandidates.pop_back_val();
// We cannot recolor physical registers.
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
continue;
assert(VRM->hasPhys(Reg) && "We have an unallocated variable!");
@@ -2940,7 +2940,7 @@ void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) {
// Get the live interval mapped with this virtual register to be able
// to check for the interference with the new color.
LiveInterval &LI = LIS->getInterval(Reg);
- unsigned CurrPhys = VRM->getPhys(Reg);
+ Register CurrPhys = VRM->getPhys(Reg);
// Check that the new color matches the register class constraints and
// that it is free for this live range.
if (CurrPhys != PhysReg && (!MRI->getRegClass(Reg)->contains(PhysReg) ||
@@ -3021,7 +3021,7 @@ void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) {
/// getting rid of 2 copies.
void RAGreedy::tryHintsRecoloring() {
for (LiveInterval *LI : SetOfBrokenHints) {
- assert(TargetRegisterInfo::isVirtualRegister(LI->reg) &&
+ assert(Register::isVirtualRegister(LI->reg) &&
"Recoloring is possible only for virtual registers");
// Some dead defs may be around (e.g., because of debug uses).
// Ignore those.
diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp
index 7a5a6c148ed4..3c4a46b12f99 100644
--- a/lib/CodeGen/RegAllocPBQP.cpp
+++ b/lib/CodeGen/RegAllocPBQP.cpp
@@ -558,7 +558,7 @@ void RegAllocPBQP::findVRegIntervalsToAlloc(const MachineFunction &MF,
// Iterate over all live ranges.
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+ unsigned Reg = Register::index2VirtReg(I);
if (MRI.reg_nodbg_empty(Reg))
continue;
VRegsToAlloc.insert(Reg);
@@ -824,11 +824,11 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
if (!VRegsToAlloc.empty()) {
const TargetSubtargetInfo &Subtarget = MF.getSubtarget();
std::unique_ptr<PBQPRAConstraintList> ConstraintsRoot =
- llvm::make_unique<PBQPRAConstraintList>();
- ConstraintsRoot->addConstraint(llvm::make_unique<SpillCosts>());
- ConstraintsRoot->addConstraint(llvm::make_unique<Interference>());
+ std::make_unique<PBQPRAConstraintList>();
+ ConstraintsRoot->addConstraint(std::make_unique<SpillCosts>());
+ ConstraintsRoot->addConstraint(std::make_unique<Interference>());
if (PBQPCoalescing)
- ConstraintsRoot->addConstraint(llvm::make_unique<Coalescing>());
+ ConstraintsRoot->addConstraint(std::make_unique<Coalescing>());
ConstraintsRoot->addConstraint(Subtarget.getCustomPBQPConstraints());
bool PBQPAllocComplete = false;
@@ -848,7 +848,7 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
std::string GraphFileName = FullyQualifiedName + "." + RS.str() +
".pbqpgraph";
std::error_code EC;
- raw_fd_ostream OS(GraphFileName, EC, sys::fs::F_Text);
+ raw_fd_ostream OS(GraphFileName, EC, sys::fs::OF_Text);
LLVM_DEBUG(dbgs() << "Dumping graph for round " << Round << " to \""
<< GraphFileName << "\"\n");
G.dump(OS);
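
sys::fs::F_Text became sys::fs::OF_Text when the open-flag enumerators gained an OF_ prefix; the behavior is unchanged. A minimal sketch of the debug-graph dump pattern, with dumpGraphToFile as an illustrative wrapper:

    #include "llvm/Support/FileSystem.h"
    #include "llvm/Support/raw_ostream.h"
    #include <string>

    bool dumpGraphToFile(const std::string &Path) {
      std::error_code EC;
      llvm::raw_fd_ostream OS(Path, EC, llvm::sys::fs::OF_Text);  // was F_Text
      if (EC)
        return false;        // e.g. permission denied
      OS << "graph G {}\n";  // stand-in for G.dump(OS)
      return true;
    }
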
diff --git a/lib/CodeGen/RegUsageInfoCollector.cpp b/lib/CodeGen/RegUsageInfoCollector.cpp
index b37dfada7101..757ff0e44953 100644
--- a/lib/CodeGen/RegUsageInfoCollector.cpp
+++ b/lib/CodeGen/RegUsageInfoCollector.cpp
@@ -142,6 +142,13 @@ bool RegUsageInfoCollector::runOnMachineFunction(MachineFunction &MF) {
auto SetRegAsDefined = [&RegMask] (unsigned Reg) {
RegMask[Reg / 32] &= ~(1u << Reg % 32);
};
+
+ // Some targets can clobber registers "inside" a call, typically in
+ // linker-generated code.
+ for (const MCPhysReg Reg : TRI->getIntraCallClobberedRegs(&MF))
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ SetRegAsDefined(*AI);
+
// Scan all the physical registers. When a register is defined in the current
// function set it and all the aliasing registers as defined in the regmask.
// FIXME: Rewrite to use regunits.
@@ -164,7 +171,8 @@ bool RegUsageInfoCollector::runOnMachineFunction(MachineFunction &MF) {
SetRegAsDefined(PReg);
}
- if (TargetFrameLowering::isSafeForNoCSROpt(F)) {
+ if (TargetFrameLowering::isSafeForNoCSROpt(F) &&
+ MF.getSubtarget().getFrameLowering()->isProfitableForNoCSROpt(F)) {
++NumCSROpt;
LLVM_DEBUG(dbgs() << MF.getName()
<< " function optimized for not having CSR.\n");
diff --git a/lib/CodeGen/RegUsageInfoPropagate.cpp b/lib/CodeGen/RegUsageInfoPropagate.cpp
index fc4be82d215e..0205e6193741 100644
--- a/lib/CodeGen/RegUsageInfoPropagate.cpp
+++ b/lib/CodeGen/RegUsageInfoPropagate.cpp
@@ -130,7 +130,11 @@ bool RegUsageInfoPropagation::runOnMachineFunction(MachineFunction &MF) {
};
if (const Function *F = findCalledFunction(M, MI)) {
- UpdateRegMask(*F);
+ if (F->isDefinitionExact()) {
+ UpdateRegMask(*F);
+ } else {
+ LLVM_DEBUG(dbgs() << "Function definition is not exact\n");
+ }
} else {
LLVM_DEBUG(dbgs() << "Failed to find call target function\n");
}
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index 2db6ab454cea..6ff5ddbc023d 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -406,8 +406,8 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) {
Partial = SrcSub || DstSub;
// If one register is a physreg, it must be Dst.
- if (TargetRegisterInfo::isPhysicalRegister(Src)) {
- if (TargetRegisterInfo::isPhysicalRegister(Dst))
+ if (Register::isPhysicalRegister(Src)) {
+ if (Register::isPhysicalRegister(Dst))
return false;
std::swap(Src, Dst);
std::swap(SrcSub, DstSub);
@@ -416,7 +416,7 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) {
const MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
- if (TargetRegisterInfo::isPhysicalRegister(Dst)) {
+ if (Register::isPhysicalRegister(Dst)) {
// Eliminate DstSub on a physreg.
if (DstSub) {
Dst = TRI.getSubReg(Dst, DstSub);
@@ -474,8 +474,8 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) {
CrossClass = NewRC != DstRC || NewRC != SrcRC;
}
// Check our invariants
- assert(TargetRegisterInfo::isVirtualRegister(Src) && "Src must be virtual");
- assert(!(TargetRegisterInfo::isPhysicalRegister(Dst) && DstSub) &&
+ assert(Register::isVirtualRegister(Src) && "Src must be virtual");
+ assert(!(Register::isPhysicalRegister(Dst) && DstSub) &&
"Cannot have a physical SubIdx");
SrcReg = Src;
DstReg = Dst;
@@ -483,7 +483,7 @@ bool CoalescerPair::setRegisters(const MachineInstr *MI) {
}
bool CoalescerPair::flip() {
- if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ if (Register::isPhysicalRegister(DstReg))
return false;
std::swap(SrcReg, DstReg);
std::swap(SrcIdx, DstIdx);
@@ -507,8 +507,8 @@ bool CoalescerPair::isCoalescable(const MachineInstr *MI) const {
}
// Now check that Dst matches DstReg.
- if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
- if (!TargetRegisterInfo::isPhysicalRegister(Dst))
+ if (Register::isPhysicalRegister(DstReg)) {
+ if (!Register::isPhysicalRegister(Dst))
return false;
assert(!DstIdx && !SrcIdx && "Inconsistent CoalescerPair state.");
// DstSub could be set for a physreg from INSERT_SUBREG.
@@ -802,7 +802,7 @@ RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
return { false, false };
MachineOperand &NewDstMO = DefMI->getOperand(NewDstIdx);
- unsigned NewReg = NewDstMO.getReg();
+ Register NewReg = NewDstMO.getReg();
if (NewReg != IntB.reg || !IntB.Query(AValNo->def).isKill())
return { false, false };
@@ -835,8 +835,8 @@ RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
TII->commuteInstruction(*DefMI, false, UseOpIdx, NewDstIdx);
if (!NewMI)
return { false, false };
- if (TargetRegisterInfo::isVirtualRegister(IntA.reg) &&
- TargetRegisterInfo::isVirtualRegister(IntB.reg) &&
+ if (Register::isVirtualRegister(IntA.reg) &&
+ Register::isVirtualRegister(IntB.reg) &&
!MRI->constrainRegClass(IntB.reg, MRI->getRegClass(IntA.reg)))
return { false, false };
if (NewMI != DefMI) {
@@ -877,7 +877,7 @@ RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
continue;
// Kill flags are no longer accurate. They are recomputed after RA.
UseMO.setIsKill(false);
- if (TargetRegisterInfo::isPhysicalRegister(NewReg))
+ if (Register::isPhysicalRegister(NewReg))
UseMO.substPhysReg(NewReg, *TRI);
else
UseMO.setReg(NewReg);
@@ -1188,7 +1188,7 @@ bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP,
/// Returns true if @p MI defines the full vreg @p Reg, as opposed to just
/// defining a subregister.
static bool definesFullReg(const MachineInstr &MI, unsigned Reg) {
- assert(!TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ assert(!Register::isPhysicalRegister(Reg) &&
"This code cannot handle physreg aliasing");
for (const MachineOperand &Op : MI.operands()) {
if (!Op.isReg() || !Op.isDef() || Op.getReg() != Reg)
@@ -1209,7 +1209,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
unsigned SrcIdx = CP.isFlipped() ? CP.getDstIdx() : CP.getSrcIdx();
unsigned DstReg = CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg();
unsigned DstIdx = CP.isFlipped() ? CP.getSrcIdx() : CP.getDstIdx();
- if (TargetRegisterInfo::isPhysicalRegister(SrcReg))
+ if (Register::isPhysicalRegister(SrcReg))
return false;
LiveInterval &SrcInt = LIS->getInterval(SrcReg);
@@ -1240,7 +1240,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
return false;
// Only support subregister destinations when the def is read-undef.
MachineOperand &DstOperand = CopyMI->getOperand(0);
- unsigned CopyDstReg = DstOperand.getReg();
+ Register CopyDstReg = DstOperand.getReg();
if (DstOperand.getSubReg() && !DstOperand.isUndef())
return false;
@@ -1254,7 +1254,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
const TargetRegisterClass *DefRC = TII->getRegClass(MCID, 0, TRI, *MF);
if (!DefMI->isImplicitDef()) {
- if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
+ if (Register::isPhysicalRegister(DstReg)) {
unsigned NewDstReg = DstReg;
unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(),
@@ -1269,7 +1269,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
} else {
// Theoretically, some stack frame reference could exist. Just make sure
// it hasn't actually happened.
- assert(TargetRegisterInfo::isVirtualRegister(DstReg) &&
+ assert(Register::isVirtualRegister(DstReg) &&
"Only expect to deal with virtual or physical registers");
}
}
@@ -1317,7 +1317,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
if (MO.isReg()) {
assert(MO.isImplicit() && "No explicit operands after implicit operands.");
// Discard VReg implicit defs.
- if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ if (Register::isPhysicalRegister(MO.getReg()))
ImplicitOps.push_back(MO);
}
}
@@ -1336,12 +1336,12 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
MachineOperand &MO = NewMI.getOperand(i);
if (MO.isReg() && MO.isDef()) {
assert(MO.isImplicit() && MO.isDead() &&
- TargetRegisterInfo::isPhysicalRegister(MO.getReg()));
+ Register::isPhysicalRegister(MO.getReg()));
NewMIImplDefs.push_back(MO.getReg());
}
}
- if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
+ if (Register::isVirtualRegister(DstReg)) {
unsigned NewIdx = NewMI.getOperand(0).getSubReg();
if (DefRC != nullptr) {
@@ -1428,7 +1428,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
} else if (NewMI.getOperand(0).getReg() != CopyDstReg) {
// The New instruction may be defining a sub-register of what's actually
// been asked for. If so it must implicitly define the whole thing.
- assert(TargetRegisterInfo::isPhysicalRegister(DstReg) &&
+ assert(Register::isPhysicalRegister(DstReg) &&
"Only expect virtual or physical registers in remat");
NewMI.getOperand(0).setIsDead(true);
NewMI.addOperand(MachineOperand::CreateReg(
@@ -1480,7 +1480,7 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
for (MachineOperand &UseMO : MRI->use_operands(SrcReg)) {
MachineInstr *UseMI = UseMO.getParent();
if (UseMI->isDebugValue()) {
- if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ if (Register::isPhysicalRegister(DstReg))
UseMO.substPhysReg(DstReg, *TRI);
else
UseMO.setReg(DstReg);
@@ -1651,7 +1651,7 @@ void RegisterCoalescer::addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx,
void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg,
unsigned DstReg,
unsigned SubIdx) {
- bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
+ bool DstIsPhys = Register::isPhysicalRegister(DstReg);
LiveInterval *DstInt = DstIsPhys ? nullptr : &LIS->getInterval(DstReg);
if (DstInt && DstInt->hasSubRanges() && DstReg != SrcReg) {
@@ -2411,8 +2411,8 @@ std::pair<const VNInfo*, unsigned> JoinVals::followCopyChain(
assert(MI && "No defining instruction");
if (!MI->isFullCopy())
return std::make_pair(VNI, TrackReg);
- unsigned SrcReg = MI->getOperand(1).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ Register SrcReg = MI->getOperand(1).getReg();
+ if (!Register::isVirtualRegister(SrcReg))
return std::make_pair(VNI, TrackReg);
const LiveInterval &LI = LIS->getInterval(SrcReg);
@@ -3189,9 +3189,9 @@ void JoinVals::eraseInstrs(SmallPtrSetImpl<MachineInstr*> &ErasedInstrs,
MachineInstr *MI = Indexes->getInstructionFromIndex(Def);
assert(MI && "No instruction to erase");
if (MI->isCopy()) {
- unsigned Reg = MI->getOperand(1).getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg) &&
- Reg != CP.getSrcReg() && Reg != CP.getDstReg())
+ Register Reg = MI->getOperand(1).getReg();
+ if (Register::isVirtualRegister(Reg) && Reg != CP.getSrcReg() &&
+ Reg != CP.getDstReg())
ShrinkRegs.push_back(Reg);
}
ErasedInstrs.insert(MI);
@@ -3463,10 +3463,10 @@ static bool isLocalCopy(MachineInstr *Copy, const LiveIntervals *LIS) {
if (Copy->getOperand(1).isUndef())
return false;
- unsigned SrcReg = Copy->getOperand(1).getReg();
- unsigned DstReg = Copy->getOperand(0).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(SrcReg)
- || TargetRegisterInfo::isPhysicalRegister(DstReg))
+ Register SrcReg = Copy->getOperand(1).getReg();
+ Register DstReg = Copy->getOperand(0).getReg();
+ if (Register::isPhysicalRegister(SrcReg) ||
+ Register::isPhysicalRegister(DstReg))
return false;
return LIS->intervalIsInOneMBB(LIS->getInterval(SrcReg))
@@ -3526,12 +3526,11 @@ bool RegisterCoalescer::applyTerminalRule(const MachineInstr &Copy) const {
if (!isMoveInstr(*TRI, &Copy, SrcReg, DstReg, SrcSubReg, DstSubReg))
return false;
// Check if the destination of this copy has any other affinity.
- if (TargetRegisterInfo::isPhysicalRegister(DstReg) ||
+ if (Register::isPhysicalRegister(DstReg) ||
// If SrcReg is a physical register, the copy won't be coalesced.
// Ignoring it may have other side effect (like missing
// rematerialization). So keep it.
- TargetRegisterInfo::isPhysicalRegister(SrcReg) ||
- !isTerminalReg(DstReg, Copy, MRI))
+ Register::isPhysicalRegister(SrcReg) || !isTerminalReg(DstReg, Copy, MRI))
return false;
// DstReg is a terminal node. Check if it interferes with any other
@@ -3554,7 +3553,7 @@ bool RegisterCoalescer::applyTerminalRule(const MachineInstr &Copy) const {
if (OtherReg == SrcReg)
OtherReg = OtherSrcReg;
// Check if OtherReg is a non-terminal.
- if (TargetRegisterInfo::isPhysicalRegister(OtherReg) ||
+ if (Register::isPhysicalRegister(OtherReg) ||
isTerminalReg(OtherReg, MI, MRI))
continue;
// Check that OtherReg interferes with DstReg.
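
One invariant in CoalescerPair::setRegisters above deserves a callout: the copy pair is normalized so any physical register ends up on the Dst side, and phys-phys copies are rejected outright. A sketch of that normalization under the migrated Register API:

    #include "llvm/CodeGen/Register.h"
    #include <utility>

    // Canonicalize a copy pair for coalescing: Src must end up virtual.
    bool normalizePair(llvm::Register &Src, llvm::Register &Dst) {
      if (llvm::Register::isPhysicalRegister(Src)) {
        if (llvm::Register::isPhysicalRegister(Dst))
          return false;        // cannot coalesce two physical registers
        std::swap(Src, Dst);   // keep the physreg in Dst
      }
      return true;
    }
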
diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp
index 7d9b3aa9b2d7..bf192d1c530d 100644
--- a/lib/CodeGen/RegisterPressure.cpp
+++ b/lib/CodeGen/RegisterPressure.cpp
@@ -134,6 +134,22 @@ void PressureDiff::dump(const TargetRegisterInfo &TRI) const {
}
dbgs() << '\n';
}
+
+LLVM_DUMP_METHOD
+void PressureChange::dump() const {
+ dbgs() << "[" << getPSetOrMax() << ", " << getUnitInc() << "]\n";
+}
+
+void RegPressureDelta::dump() const {
+ dbgs() << "[Excess=";
+ Excess.dump();
+ dbgs() << ", CriticalMax=";
+ CriticalMax.dump();
+ dbgs() << ", CurrentMax=";
+ CurrentMax.dump();
+ dbgs() << "]\n";
+}
+
#endif
void RegPressureTracker::increaseRegPressure(unsigned RegUnit,
@@ -219,7 +235,7 @@ void LiveRegSet::clear() {
}
static const LiveRange *getLiveRange(const LiveIntervals &LIS, unsigned Reg) {
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
return &LIS.getInterval(Reg);
return LIS.getCachedRegUnit(Reg);
}
@@ -345,7 +361,7 @@ void RegPressureTracker::initLiveThru(const RegPressureTracker &RPTracker) {
assert(isBottomClosed() && "need bottom-up tracking to initialize.");
for (const RegisterMaskPair &Pair : P.LiveOutRegs) {
unsigned RegUnit = Pair.RegUnit;
- if (TargetRegisterInfo::isVirtualRegister(RegUnit)
+ if (Register::isVirtualRegister(RegUnit)
&& !RPTracker.hasUntiedDef(RegUnit))
increaseSetPressure(LiveThruPressure, *MRI, RegUnit,
LaneBitmask::getNone(), Pair.LaneMask);
@@ -406,7 +422,7 @@ static LaneBitmask getLanesWithProperty(const LiveIntervals &LIS,
const MachineRegisterInfo &MRI, bool TrackLaneMasks, unsigned RegUnit,
SlotIndex Pos, LaneBitmask SafeDefault,
bool(*Property)(const LiveRange &LR, SlotIndex Pos)) {
- if (TargetRegisterInfo::isVirtualRegister(RegUnit)) {
+ if (Register::isVirtualRegister(RegUnit)) {
const LiveInterval &LI = LIS.getInterval(RegUnit);
LaneBitmask Result;
if (TrackLaneMasks && LI.hasSubRanges()) {
@@ -483,7 +499,7 @@ class RegisterOperandsCollector {
void collectOperand(const MachineOperand &MO) const {
if (!MO.isReg() || !MO.getReg())
return;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (MO.isUse()) {
if (!MO.isUndef() && !MO.isInternalRead())
pushReg(Reg, RegOpers.Uses);
@@ -503,7 +519,7 @@ class RegisterOperandsCollector {
void pushReg(unsigned Reg,
SmallVectorImpl<RegisterMaskPair> &RegUnits) const {
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
addRegLanes(RegUnits, RegisterMaskPair(Reg, LaneBitmask::getAll()));
} else if (MRI.isAllocatable(Reg)) {
for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units)
@@ -514,7 +530,7 @@ class RegisterOperandsCollector {
void collectOperandLanes(const MachineOperand &MO) const {
if (!MO.isReg() || !MO.getReg())
return;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
unsigned SubRegIdx = MO.getSubReg();
if (MO.isUse()) {
if (!MO.isUndef() && !MO.isInternalRead())
@@ -535,7 +551,7 @@ class RegisterOperandsCollector {
void pushRegLanes(unsigned Reg, unsigned SubRegIdx,
SmallVectorImpl<RegisterMaskPair> &RegUnits) const {
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
LaneBitmask LaneMask = SubRegIdx != 0
? TRI.getSubRegIndexLaneMask(SubRegIdx)
: MRI.getMaxLaneMaskForVReg(Reg);
@@ -590,7 +606,7 @@ void RegisterOperands::adjustLaneLiveness(const LiveIntervals &LIS,
// If the def is all that is live after the instruction, then in case
// of a subregister def we need a read-undef flag.
unsigned RegUnit = I->RegUnit;
- if (TargetRegisterInfo::isVirtualRegister(RegUnit) &&
+ if (Register::isVirtualRegister(RegUnit) &&
AddFlagsMI != nullptr && (LiveAfter & ~I->LaneMask).none())
AddFlagsMI->setRegisterDefReadUndef(RegUnit);
@@ -616,7 +632,7 @@ void RegisterOperands::adjustLaneLiveness(const LiveIntervals &LIS,
if (AddFlagsMI != nullptr) {
for (const RegisterMaskPair &P : DeadDefs) {
unsigned RegUnit = P.RegUnit;
- if (!TargetRegisterInfo::isVirtualRegister(RegUnit))
+ if (!Register::isVirtualRegister(RegUnit))
continue;
LaneBitmask LiveAfter = getLiveLanesAt(LIS, MRI, true, RegUnit,
Pos.getDeadSlot());
@@ -825,7 +841,7 @@ void RegPressureTracker::recede(const RegisterOperands &RegOpers,
if (TrackUntiedDefs) {
for (const RegisterMaskPair &Def : RegOpers.Defs) {
unsigned RegUnit = Def.RegUnit;
- if (TargetRegisterInfo::isVirtualRegister(RegUnit) &&
+ if (Register::isVirtualRegister(RegUnit) &&
(LiveRegs.contains(RegUnit) & Def.LaneMask).none())
UntiedDefs.insert(RegUnit);
}
diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp
index bb19110e6d70..ec0868acab38 100644
--- a/lib/CodeGen/RegisterScavenging.cpp
+++ b/lib/CodeGen/RegisterScavenging.cpp
@@ -49,7 +49,7 @@ using namespace llvm;
STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged");
-void RegScavenger::setRegUsed(unsigned Reg, LaneBitmask LaneMask) {
+void RegScavenger::setRegUsed(Register Reg, LaneBitmask LaneMask) {
LiveUnits.addRegMasked(Reg, LaneMask);
}
@@ -96,12 +96,12 @@ void RegScavenger::enterBasicBlockEnd(MachineBasicBlock &MBB) {
}
}
-void RegScavenger::addRegUnits(BitVector &BV, unsigned Reg) {
+void RegScavenger::addRegUnits(BitVector &BV, Register Reg) {
for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
BV.set(*RUI);
}
-void RegScavenger::removeRegUnits(BitVector &BV, unsigned Reg) {
+void RegScavenger::removeRegUnits(BitVector &BV, Register Reg) {
for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
BV.reset(*RUI);
}
@@ -133,8 +133,8 @@ void RegScavenger::determineKillsAndDefs() {
}
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isPhysicalRegister(Reg) || isReserved(Reg))
+ Register Reg = MO.getReg();
+ if (!Register::isPhysicalRegister(Reg) || isReserved(Reg))
continue;
if (MO.isUse()) {
@@ -204,8 +204,8 @@ void RegScavenger::forward() {
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isPhysicalRegister(Reg) || isReserved(Reg))
+ Register Reg = MO.getReg();
+ if (!Register::isPhysicalRegister(Reg) || isReserved(Reg))
continue;
if (MO.isUse()) {
if (MO.isUndef())
@@ -278,14 +278,14 @@ void RegScavenger::backward() {
--MBBI;
}
-bool RegScavenger::isRegUsed(unsigned Reg, bool includeReserved) const {
+bool RegScavenger::isRegUsed(Register Reg, bool includeReserved) const {
if (isReserved(Reg))
return includeReserved;
return !LiveUnits.available(Reg);
}
-unsigned RegScavenger::FindUnusedReg(const TargetRegisterClass *RC) const {
- for (unsigned Reg : *RC) {
+Register RegScavenger::FindUnusedReg(const TargetRegisterClass *RC) const {
+ for (Register Reg : *RC) {
if (!isRegUsed(Reg)) {
LLVM_DEBUG(dbgs() << "Scavenger found unused reg: " << printReg(Reg, TRI)
<< "\n");
@@ -297,13 +297,13 @@ unsigned RegScavenger::FindUnusedReg(const TargetRegisterClass *RC) const {
BitVector RegScavenger::getRegsAvailable(const TargetRegisterClass *RC) {
BitVector Mask(TRI->getNumRegs());
- for (unsigned Reg : *RC)
+ for (Register Reg : *RC)
if (!isRegUsed(Reg))
Mask.set(Reg);
return Mask;
}
-unsigned RegScavenger::findSurvivorReg(MachineBasicBlock::iterator StartMI,
+Register RegScavenger::findSurvivorReg(MachineBasicBlock::iterator StartMI,
BitVector &Candidates,
unsigned InstrLimit,
MachineBasicBlock::iterator &UseMI) {
@@ -329,7 +329,7 @@ unsigned RegScavenger::findSurvivorReg(MachineBasicBlock::iterator StartMI,
Candidates.clearBitsNotInMask(MO.getRegMask());
if (!MO.isReg() || MO.isUndef() || !MO.getReg())
continue;
- if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+ if (Register::isVirtualRegister(MO.getReg())) {
if (MO.isDef())
isVirtDefInsn = true;
else if (MO.isKill())
@@ -430,7 +430,7 @@ findSurvivorBackwards(const MachineRegisterInfo &MRI,
// be useful for this other vreg as well later.
bool FoundVReg = false;
for (const MachineOperand &MO : MI.operands()) {
- if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+ if (MO.isReg() && Register::isVirtualRegister(MO.getReg())) {
FoundVReg = true;
break;
}
@@ -457,7 +457,7 @@ static unsigned getFrameIndexOperandNum(MachineInstr &MI) {
}
RegScavenger::ScavengedInfo &
-RegScavenger::spill(unsigned Reg, const TargetRegisterClass &RC, int SPAdj,
+RegScavenger::spill(Register Reg, const TargetRegisterClass &RC, int SPAdj,
MachineBasicBlock::iterator Before,
MachineBasicBlock::iterator &UseMI) {
// Find an available scavenging slot with size and alignment matching
@@ -531,7 +531,7 @@ RegScavenger::spill(unsigned Reg, const TargetRegisterClass &RC, int SPAdj,
return Scavenged[SI];
}
-unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
+Register RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
MachineBasicBlock::iterator I,
int SPAdj, bool AllowSpill) {
MachineInstr &MI = *I;
@@ -542,7 +542,7 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
// Exclude all the registers being used by the instruction.
for (const MachineOperand &MO : MI.operands()) {
if (MO.isReg() && MO.getReg() != 0 && !(MO.isUse() && MO.isUndef()) &&
- !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ !Register::isVirtualRegister(MO.getReg()))
for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); ++AI)
Candidates.reset(*AI);
}
@@ -556,7 +556,7 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
// Find the register whose use is furthest away.
MachineBasicBlock::iterator UseMI;
- unsigned SReg = findSurvivorReg(I, Candidates, 25, UseMI);
+ Register SReg = findSurvivorReg(I, Candidates, 25, UseMI);
// If we found an unused register there is no reason to spill it.
if (!isRegUsed(SReg)) {
@@ -576,7 +576,7 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
return SReg;
}
-unsigned RegScavenger::scavengeRegisterBackwards(const TargetRegisterClass &RC,
+Register RegScavenger::scavengeRegisterBackwards(const TargetRegisterClass &RC,
MachineBasicBlock::iterator To,
bool RestoreAfter, int SPAdj,
bool AllowSpill) {
@@ -620,8 +620,8 @@ unsigned RegScavenger::scavengeRegisterBackwards(const TargetRegisterClass &RC,
/// \p ReserveAfter controls whether the scavenged register needs to be reserved
/// after the current instruction, otherwise it will only be reserved before the
/// current instruction.
-static unsigned scavengeVReg(MachineRegisterInfo &MRI, RegScavenger &RS,
- unsigned VReg, bool ReserveAfter) {
+static Register scavengeVReg(MachineRegisterInfo &MRI, RegScavenger &RS,
+ Register VReg, bool ReserveAfter) {
const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
#ifndef NDEBUG
// Verify that all definitions and uses are in the same basic block.
@@ -664,7 +664,7 @@ static unsigned scavengeVReg(MachineRegisterInfo &MRI, RegScavenger &RS,
// spill/reload if necessary.
int SPAdj = 0;
const TargetRegisterClass &RC = *MRI.getRegClass(VReg);
- unsigned SReg = RS.scavengeRegisterBackwards(RC, DefMI.getIterator(),
+ Register SReg = RS.scavengeRegisterBackwards(RC, DefMI.getIterator(),
ReserveAfter, SPAdj);
MRI.replaceRegWith(VReg, SReg);
++NumScavengedRegs;
@@ -694,17 +694,17 @@ static bool scavengeFrameVirtualRegsInBlock(MachineRegisterInfo &MRI,
for (const MachineOperand &MO : NMI.operands()) {
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
// We only care about virtual registers and ignore virtual registers
// created by the target callbacks in the process (those will be handled
// in a scavenging round).
- if (!TargetRegisterInfo::isVirtualRegister(Reg) ||
- TargetRegisterInfo::virtReg2Index(Reg) >= InitialNumVirtRegs)
+ if (!Register::isVirtualRegister(Reg) ||
+ Register::virtReg2Index(Reg) >= InitialNumVirtRegs)
continue;
if (!MO.readsReg())
continue;
- unsigned SReg = scavengeVReg(MRI, RS, Reg, true);
+ Register SReg = scavengeVReg(MRI, RS, Reg, true);
N->addRegisterKilled(SReg, &TRI, false);
RS.setRegUsed(SReg);
}
@@ -716,10 +716,10 @@ static bool scavengeFrameVirtualRegsInBlock(MachineRegisterInfo &MRI,
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
// Only vregs, no newly created vregs (see above).
- if (!TargetRegisterInfo::isVirtualRegister(Reg) ||
- TargetRegisterInfo::virtReg2Index(Reg) >= InitialNumVirtRegs)
+ if (!Register::isVirtualRegister(Reg) ||
+ Register::virtReg2Index(Reg) >= InitialNumVirtRegs)
continue;
// We have to look at all operands anyway so we can precalculate here
// whether there is a reading operand. This allows us to skip the use
@@ -730,14 +730,14 @@ static bool scavengeFrameVirtualRegsInBlock(MachineRegisterInfo &MRI,
NextInstructionReadsVReg = true;
}
if (MO.isDef()) {
- unsigned SReg = scavengeVReg(MRI, RS, Reg, false);
+ Register SReg = scavengeVReg(MRI, RS, Reg, false);
I->addRegisterDead(SReg, &TRI, false);
}
}
}
#ifndef NDEBUG
for (const MachineOperand &MO : MBB.front().operands()) {
- if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg()))
continue;
assert(!MO.isInternalRead() && "Cannot assign inside bundles");
assert((!MO.isUndef() || MO.isDef()) && "Cannot handle undef uses");
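
The RegScavenger signature changes from unsigned to Register are source-compatible because Register converts implicitly in both directions; a sketch, assuming this era's conversion operators:

    #include "llvm/CodeGen/Register.h"

    void takesRegister(llvm::Register R) { (void)R; }

    void caller(unsigned RawReg) {
      takesRegister(RawReg);   // unsigned -> Register, implicit
      llvm::Register R(RawReg);
      unsigned Back = R;       // Register -> unsigned, implicit
      (void)Back;
    }
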
diff --git a/lib/CodeGen/RenameIndependentSubregs.cpp b/lib/CodeGen/RenameIndependentSubregs.cpp
index 22cff48c3051..e3f5abb6301f 100644
--- a/lib/CodeGen/RenameIndependentSubregs.cpp
+++ b/lib/CodeGen/RenameIndependentSubregs.cpp
@@ -138,7 +138,7 @@ bool RenameIndependentSubregs::renameComponents(LiveInterval &LI) const {
LLVM_DEBUG(dbgs() << printReg(Reg) << ": Splitting into newly created:");
for (unsigned I = 1, NumClasses = Classes.getNumClasses(); I < NumClasses;
++I) {
- unsigned NewVReg = MRI->createVirtualRegister(RegClass);
+ Register NewVReg = MRI->createVirtualRegister(RegClass);
LiveInterval &NewLI = LIS->createEmptyInterval(NewVReg);
Intervals.push_back(&NewLI);
LLVM_DEBUG(dbgs() << ' ' << printReg(NewVReg));
@@ -390,7 +390,7 @@ bool RenameIndependentSubregs::runOnMachineFunction(MachineFunction &MF) {
// there can't be any further splitting.
bool Changed = false;
for (size_t I = 0, E = MRI->getNumVirtRegs(); I < E; ++I) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+ unsigned Reg = Register::index2VirtReg(I);
if (!LIS->hasInterval(Reg))
continue;
LiveInterval &LI = LIS->getInterval(Reg);
diff --git a/lib/CodeGen/SafeStack.cpp b/lib/CodeGen/SafeStack.cpp
index a6bc7330e2cc..ddbbd0f8d6e9 100644
--- a/lib/CodeGen/SafeStack.cpp
+++ b/lib/CodeGen/SafeStack.cpp
@@ -871,7 +871,7 @@ public:
report_fatal_error("TargetLowering instance is required");
auto *DL = &F.getParent()->getDataLayout();
- auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
auto &ACT = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
// Compute DT and LI only for functions that have the attribute.
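
getTLI() gaining a Function parameter reflects TargetLibraryInfo becoming per-function: library-call availability can differ per function via attributes like nobuiltin. A one-line sketch of the updated call, with getFunctionTLI as an illustrative wrapper:

    #include "llvm/Analysis/TargetLibraryInfo.h"
    #include "llvm/IR/Function.h"

    llvm::TargetLibraryInfo &
    getFunctionTLI(llvm::TargetLibraryInfoWrapperPass &P, llvm::Function &F) {
      return P.getTLI(F);   // was: P.getTLI()
    }
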
diff --git a/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp b/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
index 7776dffb4e9c..b4037499d7d1 100644
--- a/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
+++ b/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
@@ -173,15 +173,30 @@ static void scalarizeMaskedLoad(CallInst *CI, bool &ModifiedDT) {
return;
}
+ // If the mask is not v1i1, use scalar bit test operations. This generates
+ // better results on X86 at least.
+ Value *SclrMask;
+ if (VectorWidth != 1) {
+ Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
+ SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
+ }
+
for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
// Fill the "else" block, created in the previous iteration
//
// %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
- // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+ // %mask_1 = and i16 %scalar_mask, i32 1 << Idx
+ // %cond = icmp ne i16 %mask_1, 0
// br i1 %cond, label %cond.load, label %else
//
-
- Value *Predicate = Builder.CreateExtractElement(Mask, Idx);
+ Value *Predicate;
+ if (VectorWidth != 1) {
+ Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx));
+ Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask),
+ Builder.getIntN(VectorWidth, 0));
+ } else {
+ Predicate = Builder.CreateExtractElement(Mask, Idx);
+ }
// Create "cond" block
//
@@ -290,13 +305,29 @@ static void scalarizeMaskedStore(CallInst *CI, bool &ModifiedDT) {
return;
}
+ // If the mask is not v1i1, use scalar bit test operations. This generates
+ // better results on X86 at least.
+ Value *SclrMask;
+ if (VectorWidth != 1) {
+ Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
+ SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
+ }
+
for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
// Fill the "else" block, created in the previous iteration
//
- // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+ // %mask_1 = and i16 %scalar_mask, i32 1 << Idx
+ // %cond = icmp ne i16 %mask_1, 0
// br i1 %cond, label %cond.store, label %else
//
- Value *Predicate = Builder.CreateExtractElement(Mask, Idx);
+ Value *Predicate;
+ if (VectorWidth != 1) {
+ Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx));
+ Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask),
+ Builder.getIntN(VectorWidth, 0));
+ } else {
+ Predicate = Builder.CreateExtractElement(Mask, Idx);
+ }
// Create "cond" block
//
@@ -392,15 +423,30 @@ static void scalarizeMaskedGather(CallInst *CI, bool &ModifiedDT) {
return;
}
+ // If the mask is not v1i1, use scalar bit test operations. This generates
+ // better results on X86 at least.
+ Value *SclrMask;
+ if (VectorWidth != 1) {
+ Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
+ SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
+ }
+
for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
// Fill the "else" block, created in the previous iteration
//
- // %Mask1 = extractelement <16 x i1> %Mask, i32 1
+ // %Mask1 = and i16 %scalar_mask, i32 1 << Idx
+ // %cond = icmp ne i16 %Mask1, 0
// br i1 %cond, label %cond.load, label %else
//
- Value *Predicate =
- Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx));
+ Value *Predicate;
+ if (VectorWidth != 1) {
+ Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx));
+ Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask),
+ Builder.getIntN(VectorWidth, 0));
+ } else {
+ Predicate = Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx));
+ }
// Create "cond" block
//
@@ -499,14 +545,29 @@ static void scalarizeMaskedScatter(CallInst *CI, bool &ModifiedDT) {
return;
}
+ // If the mask is not v1i1, use scalar bit test operations. This generates
+ // better results on X86 at least.
+ Value *SclrMask;
+ if (VectorWidth != 1) {
+ Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
+ SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
+ }
+
for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
// Fill the "else" block, created in the previous iteration
//
- // %Mask1 = extractelement <16 x i1> %Mask, i32 Idx
+ // %Mask1 = and i16 %scalar_mask, i32 1 << Idx
+ // %cond = icmp ne i16 %Mask1, 0
// br i1 %cond, label %cond.store, label %else
//
- Value *Predicate =
- Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx));
+ Value *Predicate;
+ if (VectorWidth != 1) {
+ Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx));
+ Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask),
+ Builder.getIntN(VectorWidth, 0));
+ } else {
+ Predicate = Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx));
+ }
// Create "cond" block
//
@@ -555,6 +616,32 @@ static void scalarizeMaskedExpandLoad(CallInst *CI, bool &ModifiedDT) {
// The result vector
Value *VResult = PassThru;
+ // Take a shortcut if the mask is a vector of constants.
+ if (isConstantIntVector(Mask)) {
+ unsigned MemIndex = 0;
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue())
+ continue;
+ Value *NewPtr = Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, MemIndex);
+ LoadInst *Load =
+ Builder.CreateAlignedLoad(EltTy, NewPtr, 1, "Load" + Twine(Idx));
+ VResult =
+ Builder.CreateInsertElement(VResult, Load, Idx, "Res" + Twine(Idx));
+ ++MemIndex;
+ }
+ CI->replaceAllUsesWith(VResult);
+ CI->eraseFromParent();
+ return;
+ }
+
+ // If the mask is not v1i1, use scalar bit test operations. This generates
+ // better results on X86 at least.
+ Value *SclrMask;
+ if (VectorWidth != 1) {
+ Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
+ SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
+ }
+
for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
// Fill the "else" block, created in the previous iteration
//
@@ -563,8 +650,14 @@ static void scalarizeMaskedExpandLoad(CallInst *CI, bool &ModifiedDT) {
// br i1 %mask_1, label %cond.load, label %else
//
- Value *Predicate =
- Builder.CreateExtractElement(Mask, Idx);
+ Value *Predicate;
+ if (VectorWidth != 1) {
+ Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx));
+ Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask),
+ Builder.getIntN(VectorWidth, 0));
+ } else {
+ Predicate = Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx));
+ }
// Create "cond" block
//
@@ -633,13 +726,44 @@ static void scalarizeMaskedCompressStore(CallInst *CI, bool &ModifiedDT) {
unsigned VectorWidth = VecType->getNumElements();
+ // Take a shortcut if the mask is a vector of constants.
+ if (isConstantIntVector(Mask)) {
+ unsigned MemIndex = 0;
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue())
+ continue;
+ Value *OneElt =
+ Builder.CreateExtractElement(Src, Idx, "Elt" + Twine(Idx));
+ Value *NewPtr = Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, MemIndex);
+ Builder.CreateAlignedStore(OneElt, NewPtr, 1);
+ ++MemIndex;
+ }
+ CI->eraseFromParent();
+ return;
+ }
+
+ // If the mask is not v1i1, use scalar bit test operations. This generates
+ // better results on X86 at least.
+ Value *SclrMask;
+ if (VectorWidth != 1) {
+ Type *SclrMaskTy = Builder.getIntNTy(VectorWidth);
+ SclrMask = Builder.CreateBitCast(Mask, SclrMaskTy, "scalar_mask");
+ }
+
for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
// Fill the "else" block, created in the previous iteration
//
// %mask_1 = extractelement <16 x i1> %mask, i32 Idx
// br i1 %mask_1, label %cond.store, label %else
//
- Value *Predicate = Builder.CreateExtractElement(Mask, Idx);
+ Value *Predicate;
+ if (VectorWidth != 1) {
+ Value *Mask = Builder.getInt(APInt::getOneBitSet(VectorWidth, Idx));
+ Predicate = Builder.CreateICmpNE(Builder.CreateAnd(SclrMask, Mask),
+ Builder.getIntN(VectorWidth, 0));
+ } else {
+ Predicate = Builder.CreateExtractElement(Mask, Idx, "Mask" + Twine(Idx));
+ }
// Create "cond" block
//
@@ -727,17 +851,24 @@ bool ScalarizeMaskedMemIntrin::optimizeCallInst(CallInst *CI,
switch (II->getIntrinsicID()) {
default:
break;
- case Intrinsic::masked_load:
+ case Intrinsic::masked_load: {
// Scalarize unsupported vector masked load
- if (TTI->isLegalMaskedLoad(CI->getType()))
+ unsigned Alignment =
+ cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
+ if (TTI->isLegalMaskedLoad(CI->getType(), MaybeAlign(Alignment)))
return false;
scalarizeMaskedLoad(CI, ModifiedDT);
return true;
- case Intrinsic::masked_store:
- if (TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType()))
+ }
+ case Intrinsic::masked_store: {
+ unsigned Alignment =
+ cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
+ if (TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType(),
+ MaybeAlign(Alignment)))
return false;
scalarizeMaskedStore(CI, ModifiedDT);
return true;
+ }
case Intrinsic::masked_gather:
if (TTI->isLegalMaskedGather(CI->getType()))
return false;
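
The repeated pattern in this file replaces a per-lane extractelement on the <N x i1> mask with a single bitcast to iN plus one bit test per lane, which lowers to cheaper scalar TEST/branch sequences on X86. A sketch of the predicate construction, assuming an IRBuilder already positioned at the insertion point (the real code hoists the bitcast out of the lane loop):

    #include "llvm/ADT/APInt.h"
    #include "llvm/IR/IRBuilder.h"

    // Build "icmp ne (and (bitcast mask to iN), 1 << Idx), 0" for lane Idx.
    llvm::Value *lanePredicate(llvm::IRBuilder<> &B, llvm::Value *VecMask,
                               unsigned VectorWidth, unsigned Idx) {
      llvm::Type *IntNTy = B.getIntNTy(VectorWidth);
      llvm::Value *SclrMask = B.CreateBitCast(VecMask, IntNTy, "scalar_mask");
      llvm::Value *Bit = B.getInt(llvm::APInt::getOneBitSet(VectorWidth, Idx));
      return B.CreateICmpNE(B.CreateAnd(SclrMask, Bit),
                            B.getIntN(VectorWidth, 0));
    }
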
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index d5ad7e92299d..96a1f86c3e04 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -18,7 +18,6 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SparseSet.h"
#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LivePhysRegs.h"
@@ -205,10 +204,10 @@ void ScheduleDAGInstrs::addSchedBarrierDeps() {
if (ExitMI) {
for (const MachineOperand &MO : ExitMI->operands()) {
if (!MO.isReg() || MO.isDef()) continue;
- unsigned Reg = MO.getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ Register Reg = MO.getReg();
+ if (Register::isPhysicalRegister(Reg)) {
Uses.insert(PhysRegSUOper(&ExitSU, -1, Reg));
- } else if (TargetRegisterInfo::isVirtualRegister(Reg) && MO.readsReg()) {
+ } else if (Register::isVirtualRegister(Reg) && MO.readsReg()) {
addVRegUseDeps(&ExitSU, ExitMI->getOperandNo(&MO));
}
}
@@ -285,7 +284,7 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
MachineInstr *MI = SU->getInstr();
MachineOperand &MO = MI->getOperand(OperIdx);
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
// We do not need to track any dependencies for constant registers.
if (MRI.isConstantPhysReg(Reg))
return;
@@ -361,7 +360,7 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const
{
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
// No point in tracking lanemasks if we don't have interesting subregisters.
const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
if (!RC.HasDisjunctSubRegs)
@@ -373,6 +372,13 @@ LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const
return TRI->getSubRegIndexLaneMask(SubReg);
}
+bool ScheduleDAGInstrs::deadDefHasNoUse(const MachineOperand &MO) {
+ auto RegUse = CurrentVRegUses.find(MO.getReg());
+ if (RegUse == CurrentVRegUses.end())
+ return true;
+ return (RegUse->LaneMask & getLaneMaskForMO(MO)).none();
+}
+
/// Adds register output and data dependencies from this SUnit to instructions
/// that occur later in the same scheduling region if they read from or write to
/// the virtual register defined at OperIdx.
@@ -382,7 +388,7 @@ LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const
void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
MachineInstr *MI = SU->getInstr();
MachineOperand &MO = MI->getOperand(OperIdx);
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
LaneBitmask DefLaneMask;
LaneBitmask KillLaneMask;
@@ -393,6 +399,18 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
// earlier instruction.
KillLaneMask = IsKill ? LaneBitmask::getAll() : DefLaneMask;
+ if (MO.getSubReg() != 0 && MO.isUndef()) {
+ // Later operands of this instruction may contain other subregister defs of
+ // the same register. The lanes of those defs will be live after this
+ // instruction, so they should not be treated as killed by it even though
+ // they appear to be killed in this one operand.
+ for (int I = OperIdx + 1, E = MI->getNumOperands(); I != E; ++I) {
+ const MachineOperand &OtherMO = MI->getOperand(I);
+ if (OtherMO.isReg() && OtherMO.isDef() && OtherMO.getReg() == Reg)
+ KillLaneMask &= ~getLaneMaskForMO(OtherMO);
+ }
+ }
+
// Clear undef flag, we'll re-add it later once we know which subregister
// Def is first.
MO.setIsUndef(false);
@@ -402,8 +420,7 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
}
if (MO.isDead()) {
- assert(CurrentVRegUses.find(Reg) == CurrentVRegUses.end() &&
- "Dead defs should have no uses");
+ assert(deadDefHasNoUse(MO) && "Dead defs should have no uses");
} else {
// Add data dependence to all uses we found so far.
const TargetSubtargetInfo &ST = MF.getSubtarget();
@@ -491,7 +508,7 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
const MachineInstr *MI = SU->getInstr();
const MachineOperand &MO = MI->getOperand(OperIdx);
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
// Remember the use. Data dependencies will be added when we find the def.
LaneBitmask LaneMask = TrackLaneMasks ? getLaneMaskForMO(MO)
@@ -514,7 +531,7 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
/// Returns true if MI is an instruction we are unable to reason about
/// (like a call or something with unmodeled side effects).
-static inline bool isGlobalMemoryObject(AliasAnalysis *AA, MachineInstr *MI) {
+static inline bool isGlobalMemoryObject(AAResults *AA, MachineInstr *MI) {
return MI->isCall() || MI->hasUnmodeledSideEffects() ||
(MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad(AA));
}
@@ -701,7 +718,7 @@ void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) {
map.reComputeSize();
}
-void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
+void ScheduleDAGInstrs::buildSchedGraph(AAResults *AA,
RegPressureTracker *RPTracker,
PressureDiffs *PDiffs,
LiveIntervals *LIS,
@@ -821,10 +838,10 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
const MachineOperand &MO = MI.getOperand(j);
if (!MO.isReg() || !MO.isDef())
continue;
- unsigned Reg = MO.getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ Register Reg = MO.getReg();
+ if (Register::isPhysicalRegister(Reg)) {
addPhysRegDeps(SU, j);
- } else if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ } else if (Register::isVirtualRegister(Reg)) {
HasVRegDef = true;
addVRegDefDeps(SU, j);
}
@@ -838,10 +855,10 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
// additional use dependencies.
if (!MO.isReg() || !MO.isUse())
continue;
- unsigned Reg = MO.getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ Register Reg = MO.getReg();
+ if (Register::isPhysicalRegister(Reg)) {
addPhysRegDeps(SU, j);
- } else if (TargetRegisterInfo::isVirtualRegister(Reg) && MO.readsReg()) {
+ } else if (Register::isVirtualRegister(Reg) && MO.readsReg()) {
addVRegUseDeps(SU, j);
}
}
@@ -1071,7 +1088,7 @@ static void toggleKills(const MachineRegisterInfo &MRI, LivePhysRegs &LiveRegs,
for (MachineOperand &MO : MI.operands()) {
if (!MO.isReg() || !MO.readsReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
@@ -1102,7 +1119,7 @@ void ScheduleDAGInstrs::fixupKills(MachineBasicBlock &MBB) {
if (MO.isReg()) {
if (!MO.isDef())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
LiveRegs.removeReg(Reg);
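
The deadDefHasNoUse change above can be pictured with plain bitmasks. A minimal sketch, assuming a std::map in place of CurrentVRegUses and a uint32_t in place of LaneBitmask; names are illustrative, not LLVM APIs:

#include <cassert>
#include <cstdint>
#include <map>

using LaneMask = uint32_t;

static bool deadDefHasNoUse(const std::map<unsigned, LaneMask> &CurrentUses,
                            unsigned Reg, LaneMask DefLanes) {
  auto It = CurrentUses.find(Reg);
  if (It == CurrentUses.end())
    return true;                       // no use of the vreg recorded at all
  return (It->second & DefLanes) == 0; // recorded uses touch disjoint lanes
}

int main() {
  std::map<unsigned, LaneMask> Uses{{42, 0b0010}}; // one subreg of vreg42 used
  assert(deadDefHasNoUse(Uses, 7, 0b0001));   // untracked vreg
  assert(deadDefHasNoUse(Uses, 42, 0b0100));  // disjoint lanes
  assert(!deadDefHasNoUse(Uses, 42, 0b0011)); // overlapping lanes
  return 0;
}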
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 49c922f560fa..e8950b58d42d 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -24,7 +24,6 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
@@ -111,10 +110,20 @@ static cl::opt<bool>
MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true),
cl::desc("DAG combiner may split indexing from loads"));
+static cl::opt<bool>
+ EnableStoreMerging("combiner-store-merging", cl::Hidden, cl::init(true),
+ cl::desc("DAG combiner enable merging multiple stores "
+ "into a wider store"));
+
static cl::opt<unsigned> TokenFactorInlineLimit(
"combiner-tokenfactor-inline-limit", cl::Hidden, cl::init(2048),
cl::desc("Limit the number of operands to inline for Token Factors"));
+static cl::opt<unsigned> StoreMergeDependenceLimit(
+ "combiner-store-merge-dependence-limit", cl::Hidden, cl::init(10),
+ cl::desc("Limit the number of times for the same StoreNode and RootNode "
+ "to bail out in store merging dependence check"));
+
namespace {
class DAGCombiner {
@@ -152,6 +161,14 @@ namespace {
/// which have not yet been combined to the worklist.
SmallPtrSet<SDNode *, 32> CombinedNodes;
+ /// Map from a candidate StoreNode to a pair of its RootNode and a count.
+ /// The count tracks how many times this StoreNode has bailed out of the
+ /// dependence check with the same RootNode. Once the count for a pair
+ /// exceeds the limit, the StoreNode is no longer considered a store-merging
+ /// candidate with that RootNode.
+ DenseMap<SDNode *, std::pair<SDNode *, unsigned>> StoreRootCountMap;
+
// AA - Used for DAG load/store alias analysis.
AliasAnalysis *AA;
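
A minimal sketch of the bookkeeping described by the StoreRootCountMap comment above, assuming a plain std::map and an illustrative Node type; the limit mirrors the cl::opt default:

#include <cassert>
#include <map>
#include <utility>

struct Node {};
static const unsigned DependenceLimit = 10; // mirrors the cl::opt default

using RootCountMap = std::map<Node *, std::pair<Node *, unsigned>>;

static bool shouldConsider(RootCountMap &M, Node *Store, Node *Root) {
  auto It = M.find(Store);
  return It == M.end() || It->second.first != Root ||
         It->second.second < DependenceLimit;
}

static void noteBailOut(RootCountMap &M, Node *Store, Node *Root) {
  auto &Entry = M[Store];
  if (Entry.first != Root)
    Entry = {Root, 0}; // new root for this store: reset the count
  ++Entry.second;
}

int main() {
  RootCountMap M;
  Node S, R;
  for (unsigned I = 0; I < DependenceLimit; ++I) {
    assert(shouldConsider(M, &S, &R));
    noteBailOut(M, &S, &R);
  }
  assert(!shouldConsider(M, &S, &R)); // limit reached for this pair
  return 0;
}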
@@ -236,6 +253,7 @@ namespace {
void removeFromWorklist(SDNode *N) {
CombinedNodes.erase(N);
PruningList.remove(N);
+ StoreRootCountMap.erase(N);
auto It = WorklistMap.find(N);
if (It == WorklistMap.end())
@@ -361,6 +379,7 @@ namespace {
SDValue visitSUBE(SDNode *N);
SDValue visitSUBCARRY(SDNode *N);
SDValue visitMUL(SDNode *N);
+ SDValue visitMULFIX(SDNode *N);
SDValue useDivRem(SDNode *N);
SDValue visitSDIV(SDNode *N);
SDValue visitSDIVLike(SDValue N0, SDValue N1, SDNode *N);
@@ -421,7 +440,6 @@ namespace {
SDValue visitFP_TO_SINT(SDNode *N);
SDValue visitFP_TO_UINT(SDNode *N);
SDValue visitFP_ROUND(SDNode *N);
- SDValue visitFP_ROUND_INREG(SDNode *N);
SDValue visitFP_EXTEND(SDNode *N);
SDValue visitFNEG(SDNode *N);
SDValue visitFABS(SDNode *N);
@@ -470,7 +488,7 @@ namespace {
SDValue reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
SDValue N1, SDNodeFlags Flags);
- SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt);
+ SDValue visitShiftByConstant(SDNode *N);
SDValue foldSelectOfConstants(SDNode *N);
SDValue foldVSelectOfConstants(SDNode *N);
@@ -497,6 +515,7 @@ namespace {
bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
SDValue &CC) const;
bool isOneUseSetCC(SDValue N) const;
+ bool isCheaperToUseNegatedFPOps(SDValue X, SDValue Y);
SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
unsigned HiOp);
@@ -510,7 +529,7 @@ namespace {
SDValue BuildSDIVPow2(SDNode *N);
SDValue BuildUDIV(SDNode *N);
SDValue BuildLogBase2(SDValue V, const SDLoc &DL);
- SDValue BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags);
+ SDValue BuildDivEstimate(SDValue N, SDValue Op, SDNodeFlags Flags);
SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags Flags);
SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags Flags);
SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags, bool Recip);
@@ -521,11 +540,11 @@ namespace {
SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
bool DemandHighBits = true);
SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
- SDNode *MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg,
+ SDValue MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg,
SDValue InnerPos, SDValue InnerNeg,
unsigned PosOpcode, unsigned NegOpcode,
const SDLoc &DL);
- SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
+ SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
SDValue MatchLoadCombine(SDNode *N);
SDValue MatchStoreCombine(StoreSDNode *N);
SDValue ReduceLoadWidth(SDNode *N);
@@ -742,6 +761,11 @@ CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo) {
return ((DAGCombiner*)DC)->CombineTo(N, Res0, Res1, AddTo);
}
+bool TargetLowering::DAGCombinerInfo::
+recursivelyDeleteUnusedNodes(SDNode *N) {
+ return ((DAGCombiner*)DC)->recursivelyDeleteUnusedNodes(N);
+}
+
void TargetLowering::DAGCombinerInfo::
CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
return ((DAGCombiner*)DC)->CommitTargetLoweringOpt(TLO);
@@ -766,195 +790,6 @@ void DAGCombiner::deleteAndRecombine(SDNode *N) {
DAG.DeleteNode(N);
}
-/// Return 1 if we can compute the negated form of the specified expression for
-/// the same cost as the expression itself, or 2 if we can compute the negated
-/// form more cheaply than the expression itself.
-static char isNegatibleForFree(SDValue Op, bool LegalOperations,
- const TargetLowering &TLI,
- const TargetOptions *Options,
- bool ForCodeSize,
- unsigned Depth = 0) {
- // fneg is removable even if it has multiple uses.
- if (Op.getOpcode() == ISD::FNEG)
- return 2;
-
- // Don't allow anything with multiple uses unless we know it is free.
- EVT VT = Op.getValueType();
- const SDNodeFlags Flags = Op->getFlags();
- if (!Op.hasOneUse() &&
- !(Op.getOpcode() == ISD::FP_EXTEND &&
- TLI.isFPExtFree(VT, Op.getOperand(0).getValueType())))
- return 0;
-
- // Don't recurse exponentially.
- if (Depth > 6)
- return 0;
-
- switch (Op.getOpcode()) {
- default: return false;
- case ISD::ConstantFP: {
- if (!LegalOperations)
- return 1;
-
- // Don't invert constant FP values after legalization unless the target says
- // the negated constant is legal.
- return TLI.isOperationLegal(ISD::ConstantFP, VT) ||
- TLI.isFPImmLegal(neg(cast<ConstantFPSDNode>(Op)->getValueAPF()), VT,
- ForCodeSize);
- }
- case ISD::BUILD_VECTOR: {
- // Only permit BUILD_VECTOR of constants.
- if (llvm::any_of(Op->op_values(), [&](SDValue N) {
- return !N.isUndef() && !isa<ConstantFPSDNode>(N);
- }))
- return 0;
- if (!LegalOperations)
- return 1;
- if (TLI.isOperationLegal(ISD::ConstantFP, VT) &&
- TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
- return 1;
- return llvm::all_of(Op->op_values(), [&](SDValue N) {
- return N.isUndef() ||
- TLI.isFPImmLegal(neg(cast<ConstantFPSDNode>(N)->getValueAPF()), VT,
- ForCodeSize);
- });
- }
- case ISD::FADD:
- if (!Options->UnsafeFPMath && !Flags.hasNoSignedZeros())
- return 0;
-
- // After operation legalization, it might not be legal to create new FSUBs.
- if (LegalOperations && !TLI.isOperationLegalOrCustom(ISD::FSUB, VT))
- return 0;
-
- // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
- if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI,
- Options, ForCodeSize, Depth + 1))
- return V;
- // fold (fneg (fadd A, B)) -> (fsub (fneg B), A)
- return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options,
- ForCodeSize, Depth + 1);
- case ISD::FSUB:
- // We can't turn -(A-B) into B-A when we honor signed zeros.
- if (!Options->NoSignedZerosFPMath && !Flags.hasNoSignedZeros())
- return 0;
-
- // fold (fneg (fsub A, B)) -> (fsub B, A)
- return 1;
-
- case ISD::FMUL:
- case ISD::FDIV:
- // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y))
- if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI,
- Options, ForCodeSize, Depth + 1))
- return V;
-
- return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options,
- ForCodeSize, Depth + 1);
-
- case ISD::FP_EXTEND:
- case ISD::FP_ROUND:
- case ISD::FSIN:
- return isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, Options,
- ForCodeSize, Depth + 1);
- }
-}
-
-/// If isNegatibleForFree returns true, return the newly negated expression.
-static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG,
- bool LegalOperations, bool ForCodeSize,
- unsigned Depth = 0) {
- // fneg is removable even if it has multiple uses.
- if (Op.getOpcode() == ISD::FNEG)
- return Op.getOperand(0);
-
- assert(Depth <= 6 && "GetNegatedExpression doesn't match isNegatibleForFree");
- const TargetOptions &Options = DAG.getTarget().Options;
- const SDNodeFlags Flags = Op->getFlags();
-
- switch (Op.getOpcode()) {
- default: llvm_unreachable("Unknown code");
- case ISD::ConstantFP: {
- APFloat V = cast<ConstantFPSDNode>(Op)->getValueAPF();
- V.changeSign();
- return DAG.getConstantFP(V, SDLoc(Op), Op.getValueType());
- }
- case ISD::BUILD_VECTOR: {
- SmallVector<SDValue, 4> Ops;
- for (SDValue C : Op->op_values()) {
- if (C.isUndef()) {
- Ops.push_back(C);
- continue;
- }
- APFloat V = cast<ConstantFPSDNode>(C)->getValueAPF();
- V.changeSign();
- Ops.push_back(DAG.getConstantFP(V, SDLoc(Op), C.getValueType()));
- }
- return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Ops);
- }
- case ISD::FADD:
- assert(Options.UnsafeFPMath || Flags.hasNoSignedZeros());
-
- // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
- if (isNegatibleForFree(Op.getOperand(0), LegalOperations,
- DAG.getTargetLoweringInfo(), &Options, ForCodeSize,
- Depth + 1))
- return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(),
- GetNegatedExpression(Op.getOperand(0), DAG,
- LegalOperations, ForCodeSize,
- Depth + 1),
- Op.getOperand(1), Flags);
- // fold (fneg (fadd A, B)) -> (fsub (fneg B), A)
- return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(),
- GetNegatedExpression(Op.getOperand(1), DAG,
- LegalOperations, ForCodeSize,
- Depth + 1),
- Op.getOperand(0), Flags);
- case ISD::FSUB:
- // fold (fneg (fsub 0, B)) -> B
- if (ConstantFPSDNode *N0CFP =
- isConstOrConstSplatFP(Op.getOperand(0), /*AllowUndefs*/ true))
- if (N0CFP->isZero())
- return Op.getOperand(1);
-
- // fold (fneg (fsub A, B)) -> (fsub B, A)
- return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(),
- Op.getOperand(1), Op.getOperand(0), Flags);
-
- case ISD::FMUL:
- case ISD::FDIV:
- // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y)
- if (isNegatibleForFree(Op.getOperand(0), LegalOperations,
- DAG.getTargetLoweringInfo(), &Options, ForCodeSize,
- Depth + 1))
- return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
- GetNegatedExpression(Op.getOperand(0), DAG,
- LegalOperations, ForCodeSize,
- Depth + 1),
- Op.getOperand(1), Flags);
-
- // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y))
- return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
- Op.getOperand(0),
- GetNegatedExpression(Op.getOperand(1), DAG,
- LegalOperations, ForCodeSize,
- Depth + 1), Flags);
-
- case ISD::FP_EXTEND:
- case ISD::FSIN:
- return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
- GetNegatedExpression(Op.getOperand(0), DAG,
- LegalOperations, ForCodeSize,
- Depth + 1));
- case ISD::FP_ROUND:
- return DAG.getNode(ISD::FP_ROUND, SDLoc(Op), Op.getValueType(),
- GetNegatedExpression(Op.getOperand(0), DAG,
- LegalOperations, ForCodeSize,
- Depth + 1),
- Op.getOperand(1));
- }
-}
-
// APInts must be the same size for most operations, this helper
// function zero extends the shorter of the pair so that they match.
// We provide an Offset so that we can create bitwidths that won't overflow.
@@ -1124,7 +959,6 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0.getOperand(0), N1);
if (!OpNode.getNode())
return SDValue();
- AddToWorklist(OpNode.getNode());
return DAG.getNode(Opc, DL, VT, OpNode, N0.getOperand(1));
}
}
@@ -1438,7 +1272,6 @@ SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) {
SDValue RV =
DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, N0, N1));
- AddToWorklist(N0.getNode());
if (Replace)
ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode());
@@ -1591,8 +1424,8 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes);
for (SDNode *LN : UpdatedNodes) {
- AddToWorklist(LN);
AddUsersToWorklist(LN);
+ AddToWorklist(LN);
}
if (!NIsValid)
continue;
@@ -1673,6 +1506,10 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::ADDCARRY: return visitADDCARRY(N);
case ISD::SUBE: return visitSUBE(N);
case ISD::SUBCARRY: return visitSUBCARRY(N);
+ case ISD::SMULFIX:
+ case ISD::SMULFIXSAT:
+ case ISD::UMULFIX:
+ case ISD::UMULFIXSAT: return visitMULFIX(N);
case ISD::MUL: return visitMUL(N);
case ISD::SDIV: return visitSDIV(N);
case ISD::UDIV: return visitUDIV(N);
@@ -1736,7 +1573,6 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::FP_TO_SINT: return visitFP_TO_SINT(N);
case ISD::FP_TO_UINT: return visitFP_TO_UINT(N);
case ISD::FP_ROUND: return visitFP_ROUND(N);
- case ISD::FP_ROUND_INREG: return visitFP_ROUND_INREG(N);
case ISD::FP_EXTEND: return visitFP_EXTEND(N);
case ISD::FNEG: return visitFNEG(N);
case ISD::FABS: return visitFABS(N);
@@ -3308,6 +3144,18 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
}
}
+ if (TLI.isOperationLegalOrCustom(ISD::ADDCARRY, VT)) {
+ // (sub Carry, X) -> (addcarry (sub 0, X), 0, Carry)
+ if (SDValue Carry = getAsCarry(TLI, N0)) {
+ SDValue X = N1;
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ SDValue NegX = DAG.getNode(ISD::SUB, DL, VT, Zero, X);
+ return DAG.getNode(ISD::ADDCARRY, DL,
+ DAG.getVTList(VT, Carry.getValueType()), NegX, Zero,
+ Carry);
+ }
+ }
+
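
The new (sub Carry, X) fold relies on a simple modular identity. A standalone check, assuming 32-bit wrapping arithmetic and Carry in {0, 1}:

#include <cassert>
#include <cstdint>

int main() {
  // (sub Carry, X) == (addcarry (sub 0, X), 0, Carry)
  for (uint32_t Carry : {0u, 1u})
    for (uint32_t X : {0u, 1u, 42u, 0x80000000u, ~0u})
      assert(Carry - X == (0u - X) + 0u + Carry);
  return 0;
}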
return SDValue();
}
@@ -3442,6 +3290,30 @@ SDValue DAGCombiner::visitSUBCARRY(SDNode *N) {
return SDValue();
}
+// Notice that "mulfix" can be any of SMULFIX, SMULFIXSAT, UMULFIX and
+// UMULFIXSAT here.
+SDValue DAGCombiner::visitMULFIX(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue Scale = N->getOperand(2);
+ EVT VT = N0.getValueType();
+
+ // fold (mulfix x, undef, scale) -> 0
+ if (N0.isUndef() || N1.isUndef())
+ return DAG.getConstant(0, SDLoc(N), VT);
+
+ // Canonicalize constant to RHS (vector doesn't have to splat)
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0, Scale);
+
+ // fold (mulfix x, 0, scale) -> 0
+ if (isNullConstant(N1))
+ return DAG.getConstant(0, SDLoc(N), VT);
+
+ return SDValue();
+}
+
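
The (mulfix x, 0, scale) -> 0 fold holds for any scale because a fixed-point multiply is an ordinary widening multiply followed by a shift. A standalone sketch of the unsigned, non-saturating case; umulfix here is an illustrative helper, not an LLVM API:

#include <cassert>
#include <cstdint>

static uint32_t umulfix(uint32_t A, uint32_t B, unsigned Scale) {
  return uint32_t((uint64_t(A) * uint64_t(B)) >> Scale);
}

int main() {
  for (unsigned Scale = 0; Scale < 32; ++Scale)
    assert(umulfix(0x12345678u, 0u, Scale) == 0u); // zero operand -> zero
  // Sanity check: 1.5 * 2.0 in Q16.16 is 3.0.
  assert(umulfix(0x18000u, 0x20000u, 16) == 0x30000u);
  return 0;
}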
SDValue DAGCombiner::visitMUL(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -3537,7 +3409,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
// x * 15 --> (x << 4) - x
// x * -33 --> -((x << 5) + x)
// x * -15 --> -((x << 4) - x) ; this reduces --> x - (x << 4)
- if (N1IsConst && TLI.decomposeMulByConstant(VT, N1)) {
+ if (N1IsConst && TLI.decomposeMulByConstant(*DAG.getContext(), VT, N1)) {
// TODO: We could handle more general decomposition of any constant by
// having the target set a limit on number of ops and making a
// callback to determine that sequence (similar to sqrt expansion).
@@ -4083,10 +3955,10 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) {
if (VT.isVector()) {
// fold (mulhs x, 0) -> 0
- if (ISD::isBuildVectorAllZeros(N1.getNode()))
- return N1;
- if (ISD::isBuildVectorAllZeros(N0.getNode()))
- return N0;
+ // Do not return N0/N1, because an undef element may exist.
+ if (ISD::isBuildVectorAllZeros(N0.getNode()) ||
+ ISD::isBuildVectorAllZeros(N1.getNode()))
+ return DAG.getConstant(0, DL, VT);
}
// fold (mulhs x, 0) -> 0
@@ -4095,7 +3967,7 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) {
// fold (mulhs x, 1) -> (sra x, size(x)-1)
if (isOneConstant(N1))
return DAG.getNode(ISD::SRA, DL, N0.getValueType(), N0,
- DAG.getConstant(N0.getValueSizeInBits() - 1, DL,
+ DAG.getConstant(N0.getScalarValueSizeInBits() - 1, DL,
getShiftAmountTy(N0.getValueType())));
// fold (mulhs x, undef) -> 0
@@ -4130,10 +4002,10 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) {
if (VT.isVector()) {
// fold (mulhu x, 0) -> 0
- if (ISD::isBuildVectorAllZeros(N1.getNode()))
- return N1;
- if (ISD::isBuildVectorAllZeros(N0.getNode()))
- return N0;
+ // Do not return N0/N1, because an undef element may exist.
+ if (ISD::isBuildVectorAllZeros(N0.getNode()) ||
+ ISD::isBuildVectorAllZeros(N1.getNode()))
+ return DAG.getConstant(0, DL, VT);
}
// fold (mulhu x, 0) -> 0
@@ -4265,6 +4137,18 @@ SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) {
EVT VT = N->getValueType(0);
SDLoc DL(N);
+ // (umul_lohi N0, 0) -> (0, 0)
+ if (isNullConstant(N->getOperand(1))) {
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ return CombineTo(N, Zero, Zero);
+ }
+
+ // (umul_lohi N0, 1) -> (N0, 0)
+ if (isOneConstant(N->getOperand(1))) {
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ return CombineTo(N, N->getOperand(0), Zero);
+ }
+
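
A standalone check of the two umul_lohi identities introduced above, using a 32x32->64 multiply split into (lo, hi); umul_lohi here is an illustrative helper:

#include <cassert>
#include <cstdint>
#include <utility>

static std::pair<uint32_t, uint32_t> umul_lohi(uint32_t A, uint32_t B) {
  uint64_t P = uint64_t(A) * uint64_t(B);
  return {uint32_t(P), uint32_t(P >> 32)};
}

int main() {
  uint32_t X = 0xDEADBEEFu;
  auto R0 = umul_lohi(X, 0u);               // (umul_lohi x, 0) -> (0, 0)
  assert(R0.first == 0u && R0.second == 0u);
  auto R1 = umul_lohi(X, 1u);               // (umul_lohi x, 1) -> (x, 0)
  assert(R1.first == X && R1.second == 0u);
  return 0;
}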
// If a type twice as wide is legal, transform the mulhu to a wider
// multiply plus a shift.
if (VT.isSimple() && !VT.isVector()) {
@@ -4290,13 +4174,29 @@ SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) {
}
SDValue DAGCombiner::visitMULO(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
bool IsSigned = (ISD::SMULO == N->getOpcode());
+ EVT CarryVT = N->getValueType(1);
+ SDLoc DL(N);
+
+ // canonicalize constant to RHS.
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(N->getOpcode(), DL, N->getVTList(), N1, N0);
+
+ // fold (mulo x, 0) -> 0 + no carry out
+ if (isNullOrNullSplat(N1))
+ return CombineTo(N, DAG.getConstant(0, DL, VT),
+ DAG.getConstant(0, DL, CarryVT));
+
// (mulo x, 2) -> (addo x, x)
- if (ConstantSDNode *C2 = isConstOrConstSplat(N->getOperand(1)))
+ if (ConstantSDNode *C2 = isConstOrConstSplat(N1))
if (C2->getAPIntValue() == 2)
- return DAG.getNode(IsSigned ? ISD::SADDO : ISD::UADDO, SDLoc(N),
- N->getVTList(), N->getOperand(0), N->getOperand(0));
+ return DAG.getNode(IsSigned ? ISD::SADDO : ISD::UADDO, DL,
+ N->getVTList(), N0, N0);
return SDValue();
}
@@ -4444,7 +4344,9 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
if ((HandOpcode == ISD::BITCAST || HandOpcode == ISD::SCALAR_TO_VECTOR) &&
Level <= AfterLegalizeTypes) {
// Input types must be integer and the same.
- if (XVT.isInteger() && XVT == Y.getValueType()) {
+ if (XVT.isInteger() && XVT == Y.getValueType() &&
+ !(VT.isVector() && TLI.isTypeLegal(VT) &&
+ !XVT.isVector() && !TLI.isTypeLegal(XVT))) {
SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
return DAG.getNode(HandOpcode, DL, VT, Logic);
}
@@ -4770,8 +4672,8 @@ bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
return true;
}
- // Do not change the width of a volatile load.
- if (LoadN->isVolatile())
+ // Do not change the width of volatile or atomic loads.
+ if (!LoadN->isSimple())
return false;
// Do not generate loads of non-round integer types since these can
@@ -4803,15 +4705,15 @@ bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST,
if (!MemVT.isRound())
return false;
- // Don't change the width of a volatile load.
- if (LDST->isVolatile())
+ // Don't change the width of volatile or atomic loads.
+ if (!LDST->isSimple())
return false;
// Verify that we are actually reducing a load width here.
if (LDST->getMemoryVT().getSizeInBits() < MemVT.getSizeInBits())
return false;
- // Ensure that this isn't going to produce an unsupported unaligned access.
+ // Ensure that this isn't going to produce an unsupported memory access.
if (ShAmt &&
!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
LDST->getAddressSpace(), ShAmt / 8,
@@ -5076,6 +4978,59 @@ SDValue DAGCombiner::unfoldExtremeBitClearingToShifts(SDNode *N) {
return T1;
}
+/// Try to replace shift/logic that tests if a bit is clear with mask + setcc.
+/// For a target with a bit test, this is expected to become test + set and save
+/// at least 1 instruction.
+static SDValue combineShiftAnd1ToBitTest(SDNode *And, SelectionDAG &DAG) {
+ assert(And->getOpcode() == ISD::AND && "Expected an 'and' op");
+
+ // This is probably not worthwhile without a supported type.
+ EVT VT = And->getValueType(0);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isTypeLegal(VT))
+ return SDValue();
+
+ // Look through an optional extension and find a 'not'.
+ // TODO: Should we favor test+set even without the 'not' op?
+ SDValue Not = And->getOperand(0), And1 = And->getOperand(1);
+ if (Not.getOpcode() == ISD::ANY_EXTEND)
+ Not = Not.getOperand(0);
+ if (!isBitwiseNot(Not) || !Not.hasOneUse() || !isOneConstant(And1))
+ return SDValue();
+
+ // Look through an optional truncation. The source operand may not be the same
+ // type as the original 'and', but that is ok because we are masking off
+ // everything but the low bit.
+ SDValue Srl = Not.getOperand(0);
+ if (Srl.getOpcode() == ISD::TRUNCATE)
+ Srl = Srl.getOperand(0);
+
+ // Match a shift-right by constant.
+ if (Srl.getOpcode() != ISD::SRL || !Srl.hasOneUse() ||
+ !isa<ConstantSDNode>(Srl.getOperand(1)))
+ return SDValue();
+
+ // We might have looked through casts that make this transform invalid.
+ // TODO: If the source type is wider than the result type, do the mask and
+ // compare in the source type.
+ const APInt &ShiftAmt = Srl.getConstantOperandAPInt(1);
+ unsigned VTBitWidth = VT.getSizeInBits();
+ if (ShiftAmt.uge(VTBitWidth))
+ return SDValue();
+
+ // Turn this into a bit-test pattern using mask op + setcc:
+ // and (not (srl X, C)), 1 --> (and X, 1<<C) == 0
+ SDLoc DL(And);
+ SDValue X = DAG.getZExtOrTrunc(Srl.getOperand(0), DL, VT);
+ EVT CCVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ SDValue Mask = DAG.getConstant(
+ APInt::getOneBitSet(VTBitWidth, ShiftAmt.getZExtValue()), DL, VT);
+ SDValue NewAnd = DAG.getNode(ISD::AND, DL, VT, X, Mask);
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ SDValue Setcc = DAG.getSetCC(DL, CCVT, NewAnd, Zero, ISD::SETEQ);
+ return DAG.getZExtOrTrunc(Setcc, DL, VT);
+}
+
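
A standalone check of the equivalence combineShiftAnd1ToBitTest relies on: and (not (srl X, C)), 1 yields 1 exactly when bit C of X is clear, i.e. (X & (1 << C)) == 0:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0u, 1u, 0x80u, 0xDEADBEEFu, ~0u})
    for (unsigned C = 0; C < 32; ++C) {
      uint32_t ShiftForm = (~(X >> C)) & 1u;            // and (not (srl X, C)), 1
      uint32_t BitTest = (X & (1u << C)) == 0 ? 1u : 0u; // (X & (1<<C)) == 0
      assert(ShiftForm == BitTest);
    }
  return 0;
}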
SDValue DAGCombiner::visitAND(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -5163,6 +5118,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
}
+
// similarly fold (and (X (load ([non_ext|any_ext|zero_ext] V))), c) ->
// (X (load ([non_ext|zero_ext] V))) if 'and' only clears top bits which must
// already be zero by virtue of the width of the base type of the load.
@@ -5337,7 +5293,7 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
unsigned MemBitSize = MemVT.getScalarSizeInBits();
APInt ExtBits = APInt::getHighBitsSet(ExtBitSize, ExtBitSize - MemBitSize);
if (DAG.MaskedValueIsZero(N1, ExtBits) &&
- ((!LegalOperations && !LN0->isVolatile()) ||
+ ((!LegalOperations && LN0->isSimple()) ||
TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) {
SDValue ExtLoad =
DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT, LN0->getChain(),
@@ -5358,6 +5314,10 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
if (SDValue Shifts = unfoldExtremeBitClearingToShifts(N))
return Shifts;
+ if (TLI.hasBitTest(N0, N1))
+ if (SDValue V = combineShiftAnd1ToBitTest(N, DAG))
+ return V;
+
return SDValue();
}
@@ -5564,6 +5524,23 @@ static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) {
return true;
}
+// Match 2 elements of a packed halfword bswap.
+static bool isBSwapHWordPair(SDValue N, MutableArrayRef<SDNode *> Parts) {
+ if (N.getOpcode() == ISD::OR)
+ return isBSwapHWordElement(N.getOperand(0), Parts) &&
+ isBSwapHWordElement(N.getOperand(1), Parts);
+
+ if (N.getOpcode() == ISD::SRL && N.getOperand(0).getOpcode() == ISD::BSWAP) {
+ ConstantSDNode *C = isConstOrConstSplat(N.getOperand(1));
+ if (!C || C->getAPIntValue() != 16)
+ return false;
+ Parts[0] = Parts[1] = N.getOperand(0).getOperand(0).getNode();
+ return true;
+ }
+
+ return false;
+}
+
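
Why (srl (bswap x), 16) can stand in for two halfword-bswap elements: the packed halfword bswap of x equals (bswap(x) >> 16) | (bswap(x) << 16), so each half of that OR supplies two of the four byte parts. A standalone check; bswap32 and bswapHWord are illustrative helpers:

#include <cassert>
#include <cstdint>

static uint32_t bswap32(uint32_t X) {
  return (X << 24) | ((X & 0xFF00u) << 8) | ((X >> 8) & 0xFF00u) | (X >> 24);
}

static uint32_t bswapHWord(uint32_t X) {
  // Swap the bytes within each 16-bit half.
  return ((X & 0x00FF00FFu) << 8) | ((X >> 8) & 0x00FF00FFu);
}

int main() {
  for (uint32_t X : {0x01020304u, 0xDEADBEEFu, 0u, ~0u}) {
    uint32_t B = bswap32(X);
    assert(bswapHWord(X) == ((B >> 16) | (B << 16)));
  }
  return 0;
}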
/// Match a 32-bit packed halfword bswap. That is
/// ((x & 0x000000ff) << 8) |
/// ((x & 0x0000ff00) >> 8) |
@@ -5581,43 +5558,26 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
return SDValue();
// Look for either
- // (or (or (and), (and)), (or (and), (and)))
- // (or (or (or (and), (and)), (and)), (and))
- if (N0.getOpcode() != ISD::OR)
- return SDValue();
- SDValue N00 = N0.getOperand(0);
- SDValue N01 = N0.getOperand(1);
+ // (or (bswaphpair), (bswaphpair))
+ // (or (or (bswaphpair), (and)), (and))
+ // (or (or (and), (bswaphpair)), (and))
SDNode *Parts[4] = {};
- if (N1.getOpcode() == ISD::OR &&
- N00.getNumOperands() == 2 && N01.getNumOperands() == 2) {
+ if (isBSwapHWordPair(N0, Parts)) {
// (or (or (and), (and)), (or (and), (and)))
- if (!isBSwapHWordElement(N00, Parts))
+ if (!isBSwapHWordPair(N1, Parts))
return SDValue();
-
- if (!isBSwapHWordElement(N01, Parts))
- return SDValue();
- SDValue N10 = N1.getOperand(0);
- if (!isBSwapHWordElement(N10, Parts))
- return SDValue();
- SDValue N11 = N1.getOperand(1);
- if (!isBSwapHWordElement(N11, Parts))
- return SDValue();
- } else {
+ } else if (N0.getOpcode() == ISD::OR) {
// (or (or (or (and), (and)), (and)), (and))
if (!isBSwapHWordElement(N1, Parts))
return SDValue();
- if (!isBSwapHWordElement(N01, Parts))
- return SDValue();
- if (N00.getOpcode() != ISD::OR)
- return SDValue();
- SDValue N000 = N00.getOperand(0);
- if (!isBSwapHWordElement(N000, Parts))
- return SDValue();
- SDValue N001 = N00.getOperand(1);
- if (!isBSwapHWordElement(N001, Parts))
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ if (!(isBSwapHWordElement(N01, Parts) && isBSwapHWordPair(N00, Parts)) &&
+ !(isBSwapHWordElement(N00, Parts) && isBSwapHWordPair(N01, Parts)))
return SDValue();
- }
+ } else
+ return SDValue();
// Make sure the parts are all coming from the same node.
if (Parts[0] != Parts[1] || Parts[0] != Parts[2] || Parts[0] != Parts[3])
@@ -5791,15 +5751,11 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
SDValue NewLHS = ZeroN00 ? N0.getOperand(1) : N0.getOperand(0);
SDValue NewRHS = ZeroN10 ? N1.getOperand(1) : N1.getOperand(0);
- bool LegalMask = TLI.isShuffleMaskLegal(Mask, VT);
- if (!LegalMask) {
- std::swap(NewLHS, NewRHS);
- ShuffleVectorSDNode::commuteMask(Mask);
- LegalMask = TLI.isShuffleMaskLegal(Mask, VT);
- }
-
- if (LegalMask)
- return DAG.getVectorShuffle(VT, SDLoc(N), NewLHS, NewRHS, Mask);
+ SDValue LegalShuffle =
+ TLI.buildLegalVectorShuffle(VT, SDLoc(N), NewLHS, NewRHS,
+ Mask, DAG);
+ if (LegalShuffle)
+ return LegalShuffle;
}
}
}
@@ -5867,8 +5823,8 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
return V;
// See if this is some rotate idiom.
- if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N)))
- return SDValue(Rot, 0);
+ if (SDValue Rot = MatchRotate(N0, N1, SDLoc(N)))
+ return Rot;
if (SDValue Load = MatchLoadCombine(N))
return Load;
@@ -5914,6 +5870,9 @@ static bool matchRotateHalf(SelectionDAG &DAG, SDValue Op, SDValue &Shift,
/// Otherwise, returns an expansion of \p ExtractFrom based on the following
/// patterns:
///
+/// (or (add v v) (shrl v bitwidth-1)):
+/// expands (add v v) -> (shl v 1)
+///
/// (or (mul v c0) (shrl (mul v c1) c2)):
/// expands (mul v c0) -> (shl (mul v c1) c3)
///
@@ -5936,6 +5895,23 @@ static SDValue extractShiftForRotate(SelectionDAG &DAG, SDValue OppShift,
"Existing shift must be valid as a rotate half");
ExtractFrom = stripConstantMask(DAG, ExtractFrom, Mask);
+
+ // Value and Type of the shift.
+ SDValue OppShiftLHS = OppShift.getOperand(0);
+ EVT ShiftedVT = OppShiftLHS.getValueType();
+
+ // Amount of the existing shift.
+ ConstantSDNode *OppShiftCst = isConstOrConstSplat(OppShift.getOperand(1));
+
+ // (add v v) -> (shl v 1)
+ if (OppShift.getOpcode() == ISD::SRL && OppShiftCst &&
+ ExtractFrom.getOpcode() == ISD::ADD &&
+ ExtractFrom.getOperand(0) == ExtractFrom.getOperand(1) &&
+ ExtractFrom.getOperand(0) == OppShiftLHS &&
+ OppShiftCst->getAPIntValue() == ShiftedVT.getScalarSizeInBits() - 1)
+ return DAG.getNode(ISD::SHL, DL, ShiftedVT, OppShiftLHS,
+ DAG.getShiftAmountConstant(1, ShiftedVT, DL));
+
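
The (add v v) expansion above is justified because v + v == v << 1 in modular arithmetic, and together with (srl v, bitwidth-1) it completes a rotate-left-by-one. A standalone 32-bit check:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t V : {1u, 0x80000000u, 0xDEADBEEFu, ~0u}) {
    uint32_t AddForm = (V + V) | (V >> 31); // (or (add v v) (srl v 31))
    uint32_t Rotl1 = (V << 1) | (V >> 31);  // rotl(v, 1)
    assert(AddForm == Rotl1);
  }
  return 0;
}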
// Preconditions:
// (or (op0 v c0) (shiftl/r (op0 v c1) c2))
//
@@ -5959,15 +5935,11 @@ static SDValue extractShiftForRotate(SelectionDAG &DAG, SDValue OppShift,
// op0 must be the same opcode on both sides, have the same LHS argument,
// and produce the same value type.
- SDValue OppShiftLHS = OppShift.getOperand(0);
- EVT ShiftedVT = OppShiftLHS.getValueType();
if (OppShiftLHS.getOpcode() != ExtractFrom.getOpcode() ||
OppShiftLHS.getOperand(0) != ExtractFrom.getOperand(0) ||
ShiftedVT != ExtractFrom.getValueType())
return SDValue();
- // Amount of the existing shift.
- ConstantSDNode *OppShiftCst = isConstOrConstSplat(OppShift.getOperand(1));
// Constant mul/udiv/shift amount from the RHS of the shift's LHS op.
ConstantSDNode *OppLHSCst = isConstOrConstSplat(OppShiftLHS.getOperand(1));
// Constant mul/udiv/shift amount from the RHS of the ExtractFrom op.
@@ -6137,7 +6109,7 @@ static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,
// to both (PosOpcode Shifted, Pos) and (NegOpcode Shifted, Neg), with the
// former being preferred if supported. InnerPos and InnerNeg are Pos and
// Neg with outer conversions stripped away.
-SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
+SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
SDValue Neg, SDValue InnerPos,
SDValue InnerNeg, unsigned PosOpcode,
unsigned NegOpcode, const SDLoc &DL) {
@@ -6152,32 +6124,33 @@ SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits(), DAG)) {
bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT);
return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted,
- HasPos ? Pos : Neg).getNode();
+ HasPos ? Pos : Neg);
}
- return nullptr;
+ return SDValue();
}
// MatchRotate - Handle an 'or' of two operands. If this is one of the many
// idioms for rotate, and if the target supports rotation instructions, generate
// a rot[lr].
-SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
+SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
// Must be a legal type. Expanded and promoted types won't work with rotates.
EVT VT = LHS.getValueType();
- if (!TLI.isTypeLegal(VT)) return nullptr;
+ if (!TLI.isTypeLegal(VT))
+ return SDValue();
// The target must have at least one rotate flavor.
bool HasROTL = hasOperation(ISD::ROTL, VT);
bool HasROTR = hasOperation(ISD::ROTR, VT);
- if (!HasROTL && !HasROTR) return nullptr;
+ if (!HasROTL && !HasROTR)
+ return SDValue();
// Check for truncated rotate.
if (LHS.getOpcode() == ISD::TRUNCATE && RHS.getOpcode() == ISD::TRUNCATE &&
LHS.getOperand(0).getValueType() == RHS.getOperand(0).getValueType()) {
assert(LHS.getValueType() == RHS.getValueType());
- if (SDNode *Rot = MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL)) {
- return DAG.getNode(ISD::TRUNCATE, SDLoc(LHS), LHS.getValueType(),
- SDValue(Rot, 0)).getNode();
+ if (SDValue Rot = MatchRotate(LHS.getOperand(0), RHS.getOperand(0), DL)) {
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(LHS), LHS.getValueType(), Rot);
}
}
@@ -6192,7 +6165,7 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
// If neither side matched a rotate half, bail
if (!LHSShift && !RHSShift)
- return nullptr;
+ return SDValue();
// InstCombine may have combined a constant shl, srl, mul, or udiv with one
// side of the rotate, so try to handle that here. In all cases we need to
@@ -6215,15 +6188,15 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
// If a side is still missing, nothing else we can do.
if (!RHSShift || !LHSShift)
- return nullptr;
+ return SDValue();
// At this point we've matched or extracted a shift op on each side.
if (LHSShift.getOperand(0) != RHSShift.getOperand(0))
- return nullptr; // Not shifting the same value.
+ return SDValue(); // Not shifting the same value.
if (LHSShift.getOpcode() == RHSShift.getOpcode())
- return nullptr; // Shifts must disagree.
+ return SDValue(); // Shifts must disagree.
// Canonicalize shl to left side in a shl/srl pair.
if (RHSShift.getOpcode() == ISD::SHL) {
@@ -6267,13 +6240,13 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
Rot = DAG.getNode(ISD::AND, DL, VT, Rot, Mask);
}
- return Rot.getNode();
+ return Rot;
}
// If there is a mask here, and we have a variable shift, we can't be sure
// that we're masking out the right stuff.
if (LHSMask.getNode() || RHSMask.getNode())
- return nullptr;
+ return SDValue();
// If the shift amount is sign/zext/any-extended just peel it off.
SDValue LExtOp0 = LHSShiftAmt;
@@ -6290,17 +6263,17 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
RExtOp0 = RHSShiftAmt.getOperand(0);
}
- SDNode *TryL = MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt,
+ SDValue TryL = MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt,
LExtOp0, RExtOp0, ISD::ROTL, ISD::ROTR, DL);
if (TryL)
return TryL;
- SDNode *TryR = MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
+ SDValue TryR = MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
RExtOp0, LExtOp0, ISD::ROTR, ISD::ROTL, DL);
if (TryR)
return TryR;
- return nullptr;
+ return SDValue();
}
namespace {
@@ -6415,7 +6388,7 @@ calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth,
Depth + 1);
case ISD::LOAD: {
auto L = cast<LoadSDNode>(Op.getNode());
- if (L->isVolatile() || L->isIndexed())
+ if (!L->isSimple() || L->isIndexed())
return None;
unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
@@ -6504,8 +6477,9 @@ SDValue DAGCombiner::MatchStoreCombine(StoreSDNode *N) {
SDValue Chain;
SmallVector<StoreSDNode *, 8> Stores;
for (StoreSDNode *Store = N; Store; Store = dyn_cast<StoreSDNode>(Chain)) {
+ // TODO: Allow unordered atomics when wider type is legal (see D66309)
if (Store->getMemoryVT() != MVT::i8 ||
- Store->isVolatile() || Store->isIndexed())
+ !Store->isSimple() || Store->isIndexed())
return SDValue();
Stores.push_back(Store);
Chain = Store->getChain();
@@ -6716,7 +6690,8 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
return SDValue();
LoadSDNode *L = P->Load;
- assert(L->hasNUsesOfValue(1, 0) && !L->isVolatile() && !L->isIndexed() &&
+ assert(L->hasNUsesOfValue(1, 0) && L->isSimple() &&
+ !L->isIndexed() &&
"Must be enforced by calculateByteProvider");
assert(L->getOffset().isUndef() && "Unindexed load must have undef offset");
@@ -6958,25 +6933,25 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
// fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc
if (isOneConstant(N1) && VT == MVT::i1 && N0.hasOneUse() &&
(N0Opcode == ISD::OR || N0Opcode == ISD::AND)) {
- SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
- if (isOneUseSetCC(RHS) || isOneUseSetCC(LHS)) {
+ SDValue N00 = N0.getOperand(0), N01 = N0.getOperand(1);
+ if (isOneUseSetCC(N01) || isOneUseSetCC(N00)) {
unsigned NewOpcode = N0Opcode == ISD::AND ? ISD::OR : ISD::AND;
- LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS
- RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS
- AddToWorklist(LHS.getNode()); AddToWorklist(RHS.getNode());
- return DAG.getNode(NewOpcode, DL, VT, LHS, RHS);
+ N00 = DAG.getNode(ISD::XOR, SDLoc(N00), VT, N00, N1); // N00 = ~N00
+ N01 = DAG.getNode(ISD::XOR, SDLoc(N01), VT, N01, N1); // N01 = ~N01
+ AddToWorklist(N00.getNode()); AddToWorklist(N01.getNode());
+ return DAG.getNode(NewOpcode, DL, VT, N00, N01);
}
}
// fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are constants
if (isAllOnesConstant(N1) && N0.hasOneUse() &&
(N0Opcode == ISD::OR || N0Opcode == ISD::AND)) {
- SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
- if (isa<ConstantSDNode>(RHS) || isa<ConstantSDNode>(LHS)) {
+ SDValue N00 = N0.getOperand(0), N01 = N0.getOperand(1);
+ if (isa<ConstantSDNode>(N01) || isa<ConstantSDNode>(N00)) {
unsigned NewOpcode = N0Opcode == ISD::AND ? ISD::OR : ISD::AND;
- LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS
- RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS
- AddToWorklist(LHS.getNode()); AddToWorklist(RHS.getNode());
- return DAG.getNode(NewOpcode, DL, VT, LHS, RHS);
+ N00 = DAG.getNode(ISD::XOR, SDLoc(N00), VT, N00, N1); // N00 = ~N00
+ N01 = DAG.getNode(ISD::XOR, SDLoc(N01), VT, N01, N1); // N01 = ~N01
+ AddToWorklist(N00.getNode()); AddToWorklist(N01.getNode());
+ return DAG.getNode(NewOpcode, DL, VT, N00, N01);
}
}
@@ -7079,26 +7054,103 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
return SDValue();
}
+/// If we have a shift-by-constant of a bitwise logic op that itself has a
+/// shift-by-constant operand with identical opcode, we may be able to convert
+/// that into 2 independent shifts followed by the logic op. This is a
+/// throughput improvement.
+static SDValue combineShiftOfShiftedLogic(SDNode *Shift, SelectionDAG &DAG) {
+ // Match a one-use bitwise logic op.
+ SDValue LogicOp = Shift->getOperand(0);
+ if (!LogicOp.hasOneUse())
+ return SDValue();
+
+ unsigned LogicOpcode = LogicOp.getOpcode();
+ if (LogicOpcode != ISD::AND && LogicOpcode != ISD::OR &&
+ LogicOpcode != ISD::XOR)
+ return SDValue();
+
+ // Find a matching one-use shift by constant.
+ unsigned ShiftOpcode = Shift->getOpcode();
+ SDValue C1 = Shift->getOperand(1);
+ ConstantSDNode *C1Node = isConstOrConstSplat(C1);
+ assert(C1Node && "Expected a shift with constant operand");
+ const APInt &C1Val = C1Node->getAPIntValue();
+ auto matchFirstShift = [&](SDValue V, SDValue &ShiftOp,
+ const APInt *&ShiftAmtVal) {
+ if (V.getOpcode() != ShiftOpcode || !V.hasOneUse())
+ return false;
+
+ ConstantSDNode *ShiftCNode = isConstOrConstSplat(V.getOperand(1));
+ if (!ShiftCNode)
+ return false;
+
+ // Capture the shifted operand and shift amount value.
+ ShiftOp = V.getOperand(0);
+ ShiftAmtVal = &ShiftCNode->getAPIntValue();
+
+ // Shift amount types do not have to match their operand type, so check that
+ // the constants are the same width.
+ if (ShiftAmtVal->getBitWidth() != C1Val.getBitWidth())
+ return false;
+
+ // The fold is not valid if the sum of the shift values exceeds bitwidth.
+ if ((*ShiftAmtVal + C1Val).uge(V.getScalarValueSizeInBits()))
+ return false;
+
+ return true;
+ };
+
+ // Logic ops are commutative, so check each operand for a match.
+ SDValue X, Y;
+ const APInt *C0Val;
+ if (matchFirstShift(LogicOp.getOperand(0), X, C0Val))
+ Y = LogicOp.getOperand(1);
+ else if (matchFirstShift(LogicOp.getOperand(1), X, C0Val))
+ Y = LogicOp.getOperand(0);
+ else
+ return SDValue();
+
+ // shift (logic (shift X, C0), Y), C1 -> logic (shift X, C0+C1), (shift Y, C1)
+ SDLoc DL(Shift);
+ EVT VT = Shift->getValueType(0);
+ EVT ShiftAmtVT = Shift->getOperand(1).getValueType();
+ SDValue ShiftSumC = DAG.getConstant(*C0Val + C1Val, DL, ShiftAmtVT);
+ SDValue NewShift1 = DAG.getNode(ShiftOpcode, DL, VT, X, ShiftSumC);
+ SDValue NewShift2 = DAG.getNode(ShiftOpcode, DL, VT, Y, C1);
+ return DAG.getNode(LogicOpcode, DL, VT, NewShift1, NewShift2);
+}
+
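
combineShiftOfShiftedLogic depends on shifts distributing over bitwise logic: shift (logic (shift X, C0), Y), C1 equals logic (shift X, C0+C1), (shift Y, C1) whenever C0 + C1 stays below the bit width. A standalone check for the shl/xor case (the same algebra holds for srl and for and/or):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0x12345678u, Y = 0x0F0F0F0Fu;
  for (unsigned C0 = 0; C0 < 32; ++C0)
    for (unsigned C1 = 0; C0 + C1 < 32; ++C1) {
      uint32_t Nested = ((X << C0) ^ Y) << C1;          // shl (xor (shl X, C0), Y), C1
      uint32_t Split = (X << (C0 + C1)) ^ (Y << C1);    // xor (shl X, C0+C1), (shl Y, C1)
      assert(Nested == Split);
    }
  return 0;
}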
/// Handle transforms common to the three shifts, when the shift amount is a
/// constant.
/// We are looking for: (shift being one of shl/sra/srl)
/// shift (binop X, C0), C1
/// And want to transform into:
/// binop (shift X, C1), (shift C0, C1)
-SDValue DAGCombiner::visitShiftByConstant(SDNode *N, ConstantSDNode *Amt) {
+SDValue DAGCombiner::visitShiftByConstant(SDNode *N) {
+ assert(isConstOrConstSplat(N->getOperand(1)) && "Expected constant operand");
+
// Do not turn a 'not' into a regular xor.
if (isBitwiseNot(N->getOperand(0)))
return SDValue();
// The inner binop must be one-use, since we want to replace it.
- SDNode *LHS = N->getOperand(0).getNode();
- if (!LHS->hasOneUse()) return SDValue();
+ SDValue LHS = N->getOperand(0);
+ if (!LHS.hasOneUse() || !TLI.isDesirableToCommuteWithShift(N, Level))
+ return SDValue();
+
+ // TODO: This is limited to early combining because it may reveal regressions
+ // otherwise. But since we just checked a target hook to see if this is
+ // desirable, that should have filtered out cases where this interferes
+ // with some other pattern matching.
+ if (!LegalTypes)
+ if (SDValue R = combineShiftOfShiftedLogic(N, DAG))
+ return R;
// We want to pull some binops through shifts, so that we have (and (shift))
// instead of (shift (and)), likewise for add, or, xor, etc. This sort of
// thing happens with address calculations, so it's important to canonicalize
// it.
- switch (LHS->getOpcode()) {
+ switch (LHS.getOpcode()) {
default:
return SDValue();
case ISD::OR:
@@ -7112,14 +7164,14 @@ SDValue DAGCombiner::visitShiftByConstant(SDNode *N, ConstantSDNode *Amt) {
}
// We require the RHS of the binop to be a constant and not opaque as well.
- ConstantSDNode *BinOpCst = getAsNonOpaqueConstant(LHS->getOperand(1));
+ ConstantSDNode *BinOpCst = getAsNonOpaqueConstant(LHS.getOperand(1));
if (!BinOpCst)
return SDValue();
// FIXME: disable this unless the input to the binop is a shift by a constant
// or is copy/select. Enable this in other cases once we figure out when it
// is exactly profitable.
- SDValue BinOpLHSVal = LHS->getOperand(0);
+ SDValue BinOpLHSVal = LHS.getOperand(0);
bool IsShiftByConstant = (BinOpLHSVal.getOpcode() == ISD::SHL ||
BinOpLHSVal.getOpcode() == ISD::SRA ||
BinOpLHSVal.getOpcode() == ISD::SRL) &&
@@ -7133,24 +7185,16 @@ SDValue DAGCombiner::visitShiftByConstant(SDNode *N, ConstantSDNode *Amt) {
if (IsCopyOrSelect && N->hasOneUse())
return SDValue();
- EVT VT = N->getValueType(0);
-
- if (!TLI.isDesirableToCommuteWithShift(N, Level))
- return SDValue();
-
// Fold the constants, shifting the binop RHS by the shift amount.
- SDValue NewRHS = DAG.getNode(N->getOpcode(), SDLoc(LHS->getOperand(1)),
- N->getValueType(0),
- LHS->getOperand(1), N->getOperand(1));
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ SDValue NewRHS = DAG.getNode(N->getOpcode(), DL, VT, LHS.getOperand(1),
+ N->getOperand(1));
assert(isa<ConstantSDNode>(NewRHS) && "Folding was not successful!");
- // Create the new shift.
- SDValue NewShift = DAG.getNode(N->getOpcode(),
- SDLoc(LHS->getOperand(0)),
- VT, LHS->getOperand(0), N->getOperand(1));
-
- // Create the new binop.
- return DAG.getNode(LHS->getOpcode(), SDLoc(N), VT, NewShift, NewRHS);
+ SDValue NewShift = DAG.getNode(N->getOpcode(), DL, VT, LHS.getOperand(0),
+ N->getOperand(1));
+ return DAG.getNode(LHS.getOpcode(), DL, VT, NewShift, NewRHS);
}
SDValue DAGCombiner::distributeTruncateThroughAnd(SDNode *N) {
@@ -7478,7 +7522,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
}
if (N1C && !N1C->isOpaque())
- if (SDValue NewSHL = visitShiftByConstant(N, N1C))
+ if (SDValue NewSHL = visitShiftByConstant(N))
return NewSHL;
return SDValue();
@@ -7597,6 +7641,37 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
}
}
+ // We convert trunc/ext to opposing shifts in IR, but casts may be cheaper.
+ // sra (add (shl X, N1C), AddC), N1C -->
+ // sext (add (trunc X to (width - N1C)), AddC')
+ if (!LegalTypes && N0.getOpcode() == ISD::ADD && N0.hasOneUse() && N1C &&
+ N0.getOperand(0).getOpcode() == ISD::SHL &&
+ N0.getOperand(0).getOperand(1) == N1 && N0.getOperand(0).hasOneUse()) {
+ if (ConstantSDNode *AddC = isConstOrConstSplat(N0.getOperand(1))) {
+ SDValue Shl = N0.getOperand(0);
+ // Determine what the truncate's type would be and ask the target if that
+ // is a free operation.
+ LLVMContext &Ctx = *DAG.getContext();
+ unsigned ShiftAmt = N1C->getZExtValue();
+ EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - ShiftAmt);
+ if (VT.isVector())
+ TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorNumElements());
+
+ // TODO: The simple type check probably belongs in the default hook
+ // implementation and/or target-specific overrides (because
+ // non-simple types likely require masking when legalized), but that
+ // restriction may conflict with other transforms.
+ if (TruncVT.isSimple() && TLI.isTruncateFree(VT, TruncVT)) {
+ SDLoc DL(N);
+ SDValue Trunc = DAG.getZExtOrTrunc(Shl.getOperand(0), DL, TruncVT);
+ SDValue ShiftC = DAG.getConstant(AddC->getAPIntValue().lshr(ShiftAmt).
+ trunc(TruncVT.getScalarSizeInBits()), DL, TruncVT);
+ SDValue Add = DAG.getNode(ISD::ADD, DL, TruncVT, Trunc, ShiftC);
+ return DAG.getSExtOrTrunc(Add, DL, VT);
+ }
+ }
+ }
+
// fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
if (N1.getOpcode() == ISD::TRUNCATE &&
N1.getOperand(0).getOpcode() == ISD::AND) {
@@ -7638,7 +7713,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1);
if (N1C && !N1C->isOpaque())
- if (SDValue NewSRA = visitShiftByConstant(N, N1C))
+ if (SDValue NewSRA = visitShiftByConstant(N))
return NewSRA;
return SDValue();
@@ -7819,7 +7894,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
return SDValue(N, 0);
if (N1C && !N1C->isOpaque())
- if (SDValue NewSRL = visitShiftByConstant(N, N1C))
+ if (SDValue NewSRL = visitShiftByConstant(N))
return NewSRL;
// Attempt to convert a srl of a load into a narrower zero-extending load.
@@ -8100,6 +8175,43 @@ static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
}
}
+/// If a (v)select has a condition value that is a sign-bit test, try to smear
+/// the condition operand sign-bit across the value width and use it as a mask.
+static SDValue foldSelectOfConstantsUsingSra(SDNode *N, SelectionDAG &DAG) {
+ SDValue Cond = N->getOperand(0);
+ SDValue C1 = N->getOperand(1);
+ SDValue C2 = N->getOperand(2);
+ assert(isConstantOrConstantVector(C1) && isConstantOrConstantVector(C2) &&
+ "Expected select-of-constants");
+
+ EVT VT = N->getValueType(0);
+ if (Cond.getOpcode() != ISD::SETCC || !Cond.hasOneUse() ||
+ VT != Cond.getOperand(0).getValueType())
+ return SDValue();
+
+ // The inverted-condition + commuted-select variants of these patterns are
+ // canonicalized to these forms in IR.
+ SDValue X = Cond.getOperand(0);
+ SDValue CondC = Cond.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+ if (CC == ISD::SETGT && isAllOnesOrAllOnesSplat(CondC) &&
+ isAllOnesOrAllOnesSplat(C2)) {
+ // i32 X > -1 ? C1 : -1 --> (X >>s 31) | C1
+ SDLoc DL(N);
+ SDValue ShAmtC = DAG.getConstant(X.getScalarValueSizeInBits() - 1, DL, VT);
+ SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShAmtC);
+ return DAG.getNode(ISD::OR, DL, VT, Sra, C1);
+ }
+ if (CC == ISD::SETLT && isNullOrNullSplat(CondC) && isNullOrNullSplat(C2)) {
+ // i8 X < 0 ? C1 : 0 --> (X >>s 7) & C1
+ SDLoc DL(N);
+ SDValue ShAmtC = DAG.getConstant(X.getScalarValueSizeInBits() - 1, DL, VT);
+ SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShAmtC);
+ return DAG.getNode(ISD::AND, DL, VT, Sra, C1);
+ }
+ return SDValue();
+}
+
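
A standalone check of the sign-bit smears used by foldSelectOfConstantsUsingSra, assuming the usual arithmetic right shift of negative values (implementation-defined before C++20, but universal in practice):

#include <cassert>
#include <cstdint>

int main() {
  uint32_t C1 = 0x55u;
  for (int32_t X : {0, 1, -1, 42, -42, INT32_MIN, INT32_MAX}) {
    uint32_t Smear = uint32_t(X >> 31); // all-ones if X < 0, else all-zeros
    // i32 X > -1 ? C1 : -1 --> (X >>s 31) | C1
    assert(uint32_t(X > -1 ? C1 : -1) == (Smear | C1));
    // i32 X < 0 ? C1 : 0 --> (X >>s 31) & C1
    assert(uint32_t(X < 0 ? C1 : 0) == (Smear & C1));
  }
  return 0;
}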
SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
SDValue Cond = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -8148,22 +8260,36 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
return Cond;
}
- // For any constants that differ by 1, we can transform the select into an
- // extend and add. Use a target hook because some targets may prefer to
- // transform in the other direction.
+ // Use a target hook because some targets may prefer to transform in the
+ // other direction.
if (TLI.convertSelectOfConstantsToMath(VT)) {
- if (C1->getAPIntValue() - 1 == C2->getAPIntValue()) {
+ // For any constants that differ by 1, we can transform the select into an
+ // extend and add.
+ const APInt &C1Val = C1->getAPIntValue();
+ const APInt &C2Val = C2->getAPIntValue();
+ if (C1Val - 1 == C2Val) {
// select Cond, C1, C1-1 --> add (zext Cond), C1-1
if (VT != MVT::i1)
Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
return DAG.getNode(ISD::ADD, DL, VT, Cond, N2);
}
- if (C1->getAPIntValue() + 1 == C2->getAPIntValue()) {
+ if (C1Val + 1 == C2Val) {
// select Cond, C1, C1+1 --> add (sext Cond), C1+1
if (VT != MVT::i1)
Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
return DAG.getNode(ISD::ADD, DL, VT, Cond, N2);
}
+
+ // select Cond, Pow2, 0 --> (zext Cond) << log2(Pow2)
+ if (C1Val.isPowerOf2() && C2Val.isNullValue()) {
+ if (VT != MVT::i1)
+ Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
+ SDValue ShAmtC = DAG.getConstant(C1Val.exactLogBase2(), DL, VT);
+ return DAG.getNode(ISD::SHL, DL, VT, Cond, ShAmtC);
+ }
+
+ if (SDValue V = foldSelectOfConstantsUsingSra(N, DAG))
+ return V;
}
return SDValue();
@@ -8381,23 +8507,6 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
return SDValue();
}
-static
-std::pair<SDValue, SDValue> SplitVSETCC(const SDNode *N, SelectionDAG &DAG) {
- SDLoc DL(N);
- EVT LoVT, HiVT;
- std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
-
- // Split the inputs.
- SDValue Lo, Hi, LL, LH, RL, RH;
- std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0);
- std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1);
-
- Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2));
- Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2));
-
- return std::make_pair(Lo, Hi);
-}
-
// This function assumes all the vselect's arguments are CONCAT_VECTOR
// nodes and that the condition is a BV of ConstantSDNodes (or undefs).
static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
@@ -8456,7 +8565,6 @@ static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
SDValue DAGCombiner::visitMSCATTER(SDNode *N) {
MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
SDValue Mask = MSC->getMask();
- SDValue Data = MSC->getValue();
SDValue Chain = MSC->getChain();
SDLoc DL(N);
@@ -8464,123 +8572,19 @@ SDValue DAGCombiner::visitMSCATTER(SDNode *N) {
if (ISD::isBuildVectorAllZeros(Mask.getNode()))
return Chain;
- if (Level >= AfterLegalizeTypes)
- return SDValue();
-
- // If the MSCATTER data type requires splitting and the mask is provided by a
- // SETCC, then split both nodes and its operands before legalization. This
- // prevents the type legalizer from unrolling SETCC into scalar comparisons
- // and enables future optimizations (e.g. min/max pattern matching on X86).
- if (Mask.getOpcode() != ISD::SETCC)
- return SDValue();
-
- // Check if any splitting is required.
- if (TLI.getTypeAction(*DAG.getContext(), Data.getValueType()) !=
- TargetLowering::TypeSplitVector)
- return SDValue();
- SDValue MaskLo, MaskHi;
- std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
-
- EVT LoVT, HiVT;
- std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MSC->getValueType(0));
-
- EVT MemoryVT = MSC->getMemoryVT();
- unsigned Alignment = MSC->getOriginalAlignment();
-
- EVT LoMemVT, HiMemVT;
- std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
-
- SDValue DataLo, DataHi;
- std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
-
- SDValue Scale = MSC->getScale();
- SDValue BasePtr = MSC->getBasePtr();
- SDValue IndexLo, IndexHi;
- std::tie(IndexLo, IndexHi) = DAG.SplitVector(MSC->getIndex(), DL);
-
- MachineMemOperand *MMO = DAG.getMachineFunction().
- getMachineMemOperand(MSC->getPointerInfo(),
- MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
- Alignment, MSC->getAAInfo(), MSC->getRanges());
-
- SDValue OpsLo[] = { Chain, DataLo, MaskLo, BasePtr, IndexLo, Scale };
- SDValue Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other),
- DataLo.getValueType(), DL, OpsLo, MMO);
-
- // The order of the Scatter operation after split is well defined. The "Hi"
- // part comes after the "Lo". So these two operations should be chained one
- // after another.
- SDValue OpsHi[] = { Lo, DataHi, MaskHi, BasePtr, IndexHi, Scale };
- return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(),
- DL, OpsHi, MMO);
+ return SDValue();
}
SDValue DAGCombiner::visitMSTORE(SDNode *N) {
MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
SDValue Mask = MST->getMask();
- SDValue Data = MST->getValue();
SDValue Chain = MST->getChain();
- EVT VT = Data.getValueType();
SDLoc DL(N);
// Zap masked stores with a zero mask.
if (ISD::isBuildVectorAllZeros(Mask.getNode()))
return Chain;
- if (Level >= AfterLegalizeTypes)
- return SDValue();
-
- // If the MSTORE data type requires splitting and the mask is provided by a
- // SETCC, then split both nodes and its operands before legalization. This
- // prevents the type legalizer from unrolling SETCC into scalar comparisons
- // and enables future optimizations (e.g. min/max pattern matching on X86).
- if (Mask.getOpcode() == ISD::SETCC) {
- // Check if any splitting is required.
- if (TLI.getTypeAction(*DAG.getContext(), VT) !=
- TargetLowering::TypeSplitVector)
- return SDValue();
-
- SDValue MaskLo, MaskHi, Lo, Hi;
- std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
-
- SDValue Ptr = MST->getBasePtr();
-
- EVT MemoryVT = MST->getMemoryVT();
- unsigned Alignment = MST->getOriginalAlignment();
-
- EVT LoMemVT, HiMemVT;
- std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
-
- SDValue DataLo, DataHi;
- std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
-
- MachineMemOperand *MMO = DAG.getMachineFunction().
- getMachineMemOperand(MST->getPointerInfo(),
- MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
- Alignment, MST->getAAInfo(), MST->getRanges());
-
- Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO,
- MST->isTruncatingStore(),
- MST->isCompressingStore());
-
- Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
- MST->isCompressingStore());
- unsigned HiOffset = LoMemVT.getStoreSize();
-
- MMO = DAG.getMachineFunction().getMachineMemOperand(
- MST->getPointerInfo().getWithOffset(HiOffset),
- MachineMemOperand::MOStore, HiMemVT.getStoreSize(), Alignment,
- MST->getAAInfo(), MST->getRanges());
-
- Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO,
- MST->isTruncatingStore(),
- MST->isCompressingStore());
-
- AddToWorklist(Lo.getNode());
- AddToWorklist(Hi.getNode());
-
- return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
- }
return SDValue();
}
@@ -8593,76 +8597,7 @@ SDValue DAGCombiner::visitMGATHER(SDNode *N) {
if (ISD::isBuildVectorAllZeros(Mask.getNode()))
return CombineTo(N, MGT->getPassThru(), MGT->getChain());
- if (Level >= AfterLegalizeTypes)
- return SDValue();
-
- // If the MGATHER result requires splitting and the mask is provided by a
- // SETCC, then split both nodes and its operands before legalization. This
- // prevents the type legalizer from unrolling SETCC into scalar comparisons
- // and enables future optimizations (e.g. min/max pattern matching on X86).
-
- if (Mask.getOpcode() != ISD::SETCC)
- return SDValue();
-
- EVT VT = N->getValueType(0);
-
- // Check if any splitting is required.
- if (TLI.getTypeAction(*DAG.getContext(), VT) !=
- TargetLowering::TypeSplitVector)
- return SDValue();
-
- SDValue MaskLo, MaskHi, Lo, Hi;
- std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
-
- SDValue PassThru = MGT->getPassThru();
- SDValue PassThruLo, PassThruHi;
- std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, DL);
-
- EVT LoVT, HiVT;
- std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
-
- SDValue Chain = MGT->getChain();
- EVT MemoryVT = MGT->getMemoryVT();
- unsigned Alignment = MGT->getOriginalAlignment();
-
- EVT LoMemVT, HiMemVT;
- std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
-
- SDValue Scale = MGT->getScale();
- SDValue BasePtr = MGT->getBasePtr();
- SDValue Index = MGT->getIndex();
- SDValue IndexLo, IndexHi;
- std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL);
-
- MachineMemOperand *MMO = DAG.getMachineFunction().
- getMachineMemOperand(MGT->getPointerInfo(),
- MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
- Alignment, MGT->getAAInfo(), MGT->getRanges());
-
- SDValue OpsLo[] = { Chain, PassThruLo, MaskLo, BasePtr, IndexLo, Scale };
- Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, DL, OpsLo,
- MMO);
-
- SDValue OpsHi[] = { Chain, PassThruHi, MaskHi, BasePtr, IndexHi, Scale };
- Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, DL, OpsHi,
- MMO);
-
- AddToWorklist(Lo.getNode());
- AddToWorklist(Hi.getNode());
-
- // Build a factor node to remember that this load is independent of the
- // other one.
- Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1),
- Hi.getValue(1));
-
- // Legalized the chain result - switch anything that used the old chain to
- // use the new one.
- DAG.ReplaceAllUsesOfValueWith(SDValue(MGT, 1), Chain);
-
- SDValue GatherRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
-
- SDValue RetOps[] = { GatherRes, Chain };
- return DAG.getMergeValues(RetOps, DL);
+ return SDValue();
}
SDValue DAGCombiner::visitMLOAD(SDNode *N) {
@@ -8674,76 +8609,6 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) {
if (ISD::isBuildVectorAllZeros(Mask.getNode()))
return CombineTo(N, MLD->getPassThru(), MLD->getChain());
- if (Level >= AfterLegalizeTypes)
- return SDValue();
-
- // If the MLOAD result requires splitting and the mask is provided by a
- // SETCC, then split both nodes and its operands before legalization. This
- // prevents the type legalizer from unrolling SETCC into scalar comparisons
- // and enables future optimizations (e.g. min/max pattern matching on X86).
- if (Mask.getOpcode() == ISD::SETCC) {
- EVT VT = N->getValueType(0);
-
- // Check if any splitting is required.
- if (TLI.getTypeAction(*DAG.getContext(), VT) !=
- TargetLowering::TypeSplitVector)
- return SDValue();
-
- SDValue MaskLo, MaskHi, Lo, Hi;
- std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
-
- SDValue PassThru = MLD->getPassThru();
- SDValue PassThruLo, PassThruHi;
- std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, DL);
-
- EVT LoVT, HiVT;
- std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0));
-
- SDValue Chain = MLD->getChain();
- SDValue Ptr = MLD->getBasePtr();
- EVT MemoryVT = MLD->getMemoryVT();
- unsigned Alignment = MLD->getOriginalAlignment();
-
- EVT LoMemVT, HiMemVT;
- std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
-
- MachineMemOperand *MMO = DAG.getMachineFunction().
- getMachineMemOperand(MLD->getPointerInfo(),
- MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
- Alignment, MLD->getAAInfo(), MLD->getRanges());
-
- Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, PassThruLo, LoMemVT,
- MMO, ISD::NON_EXTLOAD, MLD->isExpandingLoad());
-
- Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
- MLD->isExpandingLoad());
- unsigned HiOffset = LoMemVT.getStoreSize();
-
- MMO = DAG.getMachineFunction().getMachineMemOperand(
- MLD->getPointerInfo().getWithOffset(HiOffset),
- MachineMemOperand::MOLoad, HiMemVT.getStoreSize(), Alignment,
- MLD->getAAInfo(), MLD->getRanges());
-
- Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, PassThruHi, HiMemVT,
- MMO, ISD::NON_EXTLOAD, MLD->isExpandingLoad());
-
- AddToWorklist(Lo.getNode());
- AddToWorklist(Hi.getNode());
-
- // Build a factor node to remember that this load is independent of the
- // other one.
- Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1),
- Hi.getValue(1));
-
- // Legalized the chain result - switch anything that used the old chain to
- // use the new one.
- DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), Chain);
-
- SDValue LoadRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
-
- SDValue RetOps[] = { LoadRes, Chain };
- return DAG.getMergeValues(RetOps, DL);
- }
return SDValue();
}
@@ -8791,6 +8656,18 @@ SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) {
return DAG.getNode(ISD::ADD, DL, VT, ExtendedCond, N2);
}
+ // select Cond, Pow2C, 0 --> (zext Cond) << log2(Pow2C)
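+ // e.g. vselect %c, <8,8,8,8>, <0,0,0,0> --> shl (zext %c), <3,3,3,3>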
+ APInt Pow2C;
+ if (ISD::isConstantSplatVector(N1.getNode(), Pow2C) && Pow2C.isPowerOf2() &&
+ isNullOrNullSplat(N2)) {
+ SDValue ZextCond = DAG.getZExtOrTrunc(Cond, DL, VT);
+ SDValue ShAmtC = DAG.getConstant(Pow2C.exactLogBase2(), DL, VT);
+ return DAG.getNode(ISD::SHL, DL, VT, ZextCond, ShAmtC);
+ }
+
+ if (SDValue V = foldSelectOfConstantsUsingSra(N, DAG))
+ return V;
+
// The general case for select-of-constants:
// vselect <N x i1> Cond, C1, C2 --> xor (and (sext Cond), (C1^C2)), C2
// ...but that only makes sense if a vselect is slower than 2 logic ops, so
@@ -8832,13 +8709,12 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
isAbs = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());
if (isAbs) {
- EVT VT = LHS.getValueType();
if (TLI.isOperationLegalOrCustom(ISD::ABS, VT))
return DAG.getNode(ISD::ABS, DL, VT, LHS);
- SDValue Shift = DAG.getNode(
- ISD::SRA, DL, VT, LHS,
- DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT));
+ SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, LHS,
+ DAG.getConstant(VT.getScalarSizeInBits() - 1,
+ DL, getShiftAmountTy(VT)));
SDValue Add = DAG.getNode(ISD::ADD, DL, VT, LHS, Shift);
AddToWorklist(Shift.getNode());
AddToWorklist(Add.getNode());
@@ -8851,10 +8727,9 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
// This is OK if we don't care about what happens if either operand is a
// NaN.
//
- if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, N0.getOperand(0),
- N0.getOperand(1), TLI)) {
- if (SDValue FMinMax = combineMinNumMaxNum(
- DL, VT, N0.getOperand(0), N0.getOperand(1), N1, N2, CC, TLI, DAG))
+ if (N0.hasOneUse() && isLegalToCombineMinNumMaxNum(DAG, LHS, RHS, TLI)) {
+ if (SDValue FMinMax =
+ combineMinNumMaxNum(DL, VT, LHS, RHS, N1, N2, CC, TLI, DAG))
return FMinMax;
}
@@ -9209,8 +9084,9 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
if (!ISD::isNON_EXTLoad(LN0) || !ISD::isUNINDEXEDLoad(LN0) ||
- !N0.hasOneUse() || LN0->isVolatile() || !DstVT.isVector() ||
- !DstVT.isPow2VectorType() || !TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
+ !N0.hasOneUse() || !LN0->isSimple() ||
+ !DstVT.isVector() || !DstVT.isPow2VectorType() ||
+ !TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
return SDValue();
SmallVector<SDNode *, 4> SetCCs;
@@ -9411,7 +9287,8 @@ static SDValue tryToFoldExtOfExtload(SelectionDAG &DAG, DAGCombiner &Combiner,
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
EVT MemVT = LN0->getMemoryVT();
- if ((LegalOperations || LN0->isVolatile() || VT.isVector()) &&
+ if ((LegalOperations || !LN0->isSimple() || VT.isVector()) &&
!TLI.isLoadExtLegal(ExtLoadType, VT, MemVT))
return SDValue();
@@ -9436,7 +9313,7 @@ static SDValue tryToFoldExtOfLoad(SelectionDAG &DAG, DAGCombiner &Combiner,
if (!ISD::isNON_EXTLoad(N0.getNode()) ||
!ISD::isUNINDEXEDLoad(N0.getNode()) ||
((LegalOperations || VT.isVector() ||
- cast<LoadSDNode>(N0)->isVolatile()) &&
+ !cast<LoadSDNode>(N0)->isSimple()) &&
!TLI.isLoadExtLegal(ExtLoadType, VT, N0.getValueType())))
return {};
@@ -9468,6 +9345,35 @@ static SDValue tryToFoldExtOfLoad(SelectionDAG &DAG, DAGCombiner &Combiner,
return SDValue(N, 0); // Return N so it doesn't get rechecked!
}
+static SDValue tryToFoldExtOfMaskedLoad(SelectionDAG &DAG,
+ const TargetLowering &TLI, EVT VT,
+ SDNode *N, SDValue N0,
+ ISD::LoadExtType ExtLoadType,
+ ISD::NodeType ExtOpc) {
+ if (!N0.hasOneUse())
+ return SDValue();
+
+ MaskedLoadSDNode *Ld = dyn_cast<MaskedLoadSDNode>(N0);
+ if (!Ld || Ld->getExtensionType() != ISD::NON_EXTLOAD)
+ return SDValue();
+
+ if (!TLI.isLoadExtLegal(ExtLoadType, VT, Ld->getValueType(0)))
+ return SDValue();
+
+ if (!TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
+ return SDValue();
+
+ SDLoc dl(Ld);
+ SDValue PassThru = DAG.getNode(ExtOpc, dl, VT, Ld->getPassThru());
+ SDValue NewLoad = DAG.getMaskedLoad(VT, dl, Ld->getChain(),
+ Ld->getBasePtr(), Ld->getMask(),
+ PassThru, Ld->getMemoryVT(),
+ Ld->getMemOperand(), ExtLoadType,
+ Ld->isExpandingLoad());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), SDValue(NewLoad.getNode(), 1));
+ return NewLoad;
+}
+
static SDValue foldExtendedSignBitTest(SDNode *N, SelectionDAG &DAG,
bool LegalOperations) {
assert((N->getOpcode() == ISD::SIGN_EXTEND ||
@@ -9568,6 +9474,11 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
ISD::SEXTLOAD, ISD::SIGN_EXTEND))
return foldedExt;
+ if (SDValue foldedExt =
+ tryToFoldExtOfMaskedLoad(DAG, TLI, VT, N, N0, ISD::SEXTLOAD,
+ ISD::SIGN_EXTEND))
+ return foldedExt;
+
// fold (sext (load x)) to multiple smaller sextloads.
// Only on illegal but splittable vectors.
if (SDValue ExtLoad = CombineExtLoad(N))
@@ -9856,6 +9767,11 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
ISD::ZEXTLOAD, ISD::ZERO_EXTEND))
return foldedExt;
+ if (SDValue foldedExt =
+ tryToFoldExtOfMaskedLoad(DAG, TLI, VT, N, N0, ISD::ZEXTLOAD,
+ ISD::ZERO_EXTEND))
+ return foldedExt;
+
// fold (zext (load x)) to multiple smaller zextloads.
// Only on illegal but splittable vectors.
if (SDValue ExtLoad = CombineExtLoad(N))
@@ -10340,7 +10256,10 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
return SDValue();
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
- if (!isLegalNarrowLdSt(LN0, ExtType, ExtVT, ShAmt))
+ // Reducing the width of a volatile load is illegal. For atomics, we may be
+ // able to reduce the width provided we never widen again. (see D66309)
+ if (!LN0->isSimple() || !isLegalNarrowLdSt(LN0, ExtType, ExtVT, ShAmt))
return SDValue();
auto AdjustBigEndianShift = [&](unsigned ShAmt) {
@@ -10369,11 +10288,11 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
SDValue Load;
if (ExtType == ISD::NON_EXTLOAD)
- Load = DAG.getLoad(VT, SDLoc(N0), LN0->getChain(), NewPtr,
+ Load = DAG.getLoad(VT, DL, LN0->getChain(), NewPtr,
LN0->getPointerInfo().getWithOffset(PtrOff), NewAlign,
LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
else
- Load = DAG.getExtLoad(ExtType, SDLoc(N0), VT, LN0->getChain(), NewPtr,
+ Load = DAG.getExtLoad(ExtType, DL, VT, LN0->getChain(), NewPtr,
LN0->getPointerInfo().getWithOffset(PtrOff), ExtVT,
NewAlign, LN0->getMemOperand()->getFlags(),
LN0->getAAInfo());
@@ -10392,7 +10311,6 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
// no larger than the source) then the useful bits of the result are
// zero; we can't simply return the shortened shift, because the result
// of that operation is undefined.
- SDLoc DL(N0);
if (ShLeftAmt >= VT.getSizeInBits())
Result = DAG.getConstant(0, DL, VT);
else
@@ -10513,7 +10431,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
if (ISD::isEXTLoad(N0.getNode()) &&
ISD::isUNINDEXEDLoad(N0.getNode()) &&
EVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
- ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile() &&
+ ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple() &&
N0.hasOneUse()) ||
TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
@@ -10530,7 +10448,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
N0.hasOneUse() &&
EVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
- ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+ ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) ||
TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
@@ -10757,7 +10675,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
// after truncation.
if (N0.hasOneUse() && ISD::isUNINDEXEDLoad(N0.getNode())) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
- if (!LN0->isVolatile() &&
+ if (LN0->isSimple() &&
LN0->getMemoryVT().getStoreSizeInBits() < VT.getSizeInBits()) {
SDValue NewLoad = DAG.getExtLoad(LN0->getExtensionType(), SDLoc(LN0),
VT, LN0->getChain(), LN0->getBasePtr(),
@@ -11051,7 +10969,7 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
// memory accesses. We don't care if the original type was legal or not
// as we assume software couldn't rely on the number of accesses of an
// illegal type.
- ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+ ((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) ||
TLI.isOperationLegal(ISD::LOAD, VT))) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
@@ -11237,15 +11155,10 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
for (int i = 0; i != MaskScale; ++i)
NewMask.push_back(M < 0 ? -1 : M * MaskScale + i);
- bool LegalMask = TLI.isShuffleMaskLegal(NewMask, VT);
- if (!LegalMask) {
- std::swap(SV0, SV1);
- ShuffleVectorSDNode::commuteMask(NewMask);
- LegalMask = TLI.isShuffleMaskLegal(NewMask, VT);
- }
-
- if (LegalMask)
- return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, NewMask);
+ SDValue LegalShuffle =
+ TLI.buildLegalVectorShuffle(VT, SDLoc(N), SV0, SV1, NewMask, DAG);
+ if (LegalShuffle)
+ return LegalShuffle;
}
return SDValue();
@@ -11998,7 +11911,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
// N0 + -0.0 --> N0 (also allowed with +0.0 and fast-math)
ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1, true);
if (N1C && N1C->isZero())
- if (N1C->isNegative() || Options.UnsafeFPMath || Flags.hasNoSignedZeros())
+ if (N1C->isNegative() || Options.NoSignedZerosFPMath ||
+ Flags.hasNoSignedZeros())
return N0;
if (SDValue NewSel = foldBinOpIntoSelect(N))
@@ -12006,17 +11919,17 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
// fold (fadd A, (fneg B)) -> (fsub A, B)
if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
- isNegatibleForFree(N1, LegalOperations, TLI, &Options, ForCodeSize) == 2)
- return DAG.getNode(ISD::FSUB, DL, VT, N0,
- GetNegatedExpression(N1, DAG, LegalOperations,
- ForCodeSize), Flags);
+ TLI.isNegatibleForFree(N1, DAG, LegalOperations, ForCodeSize) == 2)
+ return DAG.getNode(
+ ISD::FSUB, DL, VT, N0,
+ TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize), Flags);
// fold (fadd (fneg A), B) -> (fsub B, A)
if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
- isNegatibleForFree(N0, LegalOperations, TLI, &Options, ForCodeSize) == 2)
- return DAG.getNode(ISD::FSUB, DL, VT, N1,
- GetNegatedExpression(N0, DAG, LegalOperations,
- ForCodeSize), Flags);
+ TLI.isNegatibleForFree(N0, DAG, LegalOperations, ForCodeSize) == 2)
+ return DAG.getNode(
+ ISD::FSUB, DL, VT, N1,
+ TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize), Flags);
auto isFMulNegTwo = [](SDValue FMul) {
if (!FMul.hasOneUse() || FMul.getOpcode() != ISD::FMUL)
@@ -12056,7 +11969,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
// If 'unsafe math' or reassoc and nsz, fold lots of things.
// TODO: break out portions of the transformations below for which Unsafe is
// considered and which do not require both nsz and reassoc
- if ((Options.UnsafeFPMath ||
+ if (((Options.UnsafeFPMath && Options.NoSignedZerosFPMath) ||
(Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) &&
AllowNewConst) {
// fadd (fadd x, c1), c2 -> fadd x, c1 + c2
@@ -12175,7 +12088,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
// (fsub A, 0) -> A
if (N1CFP && N1CFP->isZero()) {
- if (!N1CFP->isNegative() || Options.UnsafeFPMath ||
+ if (!N1CFP->isNegative() || Options.NoSignedZerosFPMath ||
Flags.hasNoSignedZeros()) {
return N0;
}
@@ -12195,16 +12108,16 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
if (N0CFP && N0CFP->isZero()) {
if (N0CFP->isNegative() ||
(Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())) {
- if (isNegatibleForFree(N1, LegalOperations, TLI, &Options, ForCodeSize))
- return GetNegatedExpression(N1, DAG, LegalOperations, ForCodeSize);
+ if (TLI.isNegatibleForFree(N1, DAG, LegalOperations, ForCodeSize))
+ return TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize);
if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
return DAG.getNode(ISD::FNEG, DL, VT, N1, Flags);
}
}
- if ((Options.UnsafeFPMath ||
- (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros()))
- && N1.getOpcode() == ISD::FADD) {
+ if (((Options.UnsafeFPMath && Options.NoSignedZerosFPMath) ||
+ (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) &&
+ N1.getOpcode() == ISD::FADD) {
// X - (X + Y) -> -Y
if (N0 == N1->getOperand(0))
return DAG.getNode(ISD::FNEG, DL, VT, N1->getOperand(1), Flags);
@@ -12214,10 +12127,10 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
}
// fold (fsub A, (fneg B)) -> (fadd A, B)
- if (isNegatibleForFree(N1, LegalOperations, TLI, &Options, ForCodeSize))
- return DAG.getNode(ISD::FADD, DL, VT, N0,
- GetNegatedExpression(N1, DAG, LegalOperations,
- ForCodeSize), Flags);
+ if (TLI.isNegatibleForFree(N1, DAG, LegalOperations, ForCodeSize))
+ return DAG.getNode(
+ ISD::FADD, DL, VT, N0,
+ TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize), Flags);
// FSUB -> FMA combines:
if (SDValue Fused = visitFSUBForFMACombine(N)) {
@@ -12228,6 +12141,21 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
return SDValue();
}
+/// Return true if both inputs are at least as cheap in negated form and at
+/// least one input is strictly cheaper in negated form.
+bool DAGCombiner::isCheaperToUseNegatedFPOps(SDValue X, SDValue Y) {
+ if (char LHSNeg =
+ TLI.isNegatibleForFree(X, DAG, LegalOperations, ForCodeSize))
+ if (char RHSNeg =
+ TLI.isNegatibleForFree(Y, DAG, LegalOperations, ForCodeSize))
+ // Both negated operands are at least as cheap as their counterparts.
+ // Check to see if at least one is cheaper negated.
+ if (LHSNeg == 2 || RHSNeg == 2)
+ return true;
+
+ return false;
+}
+
SDValue DAGCombiner::visitFMUL(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -12254,10 +12182,6 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
!isConstantFPBuildVectorOrConstantFP(N1))
return DAG.getNode(ISD::FMUL, DL, VT, N1, N0, Flags);
- // fold (fmul A, 1.0) -> A
- if (N1CFP && N1CFP->isExactlyValue(1.0))
- return N0;
-
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
@@ -12302,21 +12226,13 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
return DAG.getNode(ISD::FNEG, DL, VT, N0);
- // fold (fmul (fneg X), (fneg Y)) -> (fmul X, Y)
- if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options,
- ForCodeSize)) {
- if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options,
- ForCodeSize)) {
- // Both can be negated for free, check to see if at least one is cheaper
- // negated.
- if (LHSNeg == 2 || RHSNeg == 2)
- return DAG.getNode(ISD::FMUL, DL, VT,
- GetNegatedExpression(N0, DAG, LegalOperations,
- ForCodeSize),
- GetNegatedExpression(N1, DAG, LegalOperations,
- ForCodeSize),
- Flags);
- }
+ // -N0 * -N1 --> N0 * N1
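+ // e.g. fmul (fneg %x), (fneg %y) --> fmul %x, %y; the operands need not
+ // be literal fneg nodes, only free (or cheaper) to negate.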
+ if (isCheaperToUseNegatedFPOps(N0, N1)) {
+ SDValue NegN0 =
+ TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize);
+ SDValue NegN1 =
+ TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize);
+ return DAG.getNode(ISD::FMUL, DL, VT, NegN0, NegN1, Flags);
}
// fold (fmul X, (select (fcmp X > 0.0), -1.0, 1.0)) -> (fneg (fabs X))
@@ -12395,6 +12311,15 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
return DAG.getNode(ISD::FMA, DL, VT, N0, N1, N2);
}
+ // (-N0 * -N1) + N2 --> (N0 * N1) + N2
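+ // e.g. fma (fneg %x), (fneg %y), %z --> fma %x, %y, %z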
+ if (isCheaperToUseNegatedFPOps(N0, N1)) {
+ SDValue NegN0 =
+ TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize);
+ SDValue NegN1 =
+ TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize);
+ return DAG.getNode(ISD::FMA, DL, VT, NegN0, NegN1, N2, Flags);
+ }
+
if (UnsafeFPMath) {
if (N0CFP && N0CFP->isZero())
return N2;
@@ -12602,9 +12527,8 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
// If this FDIV is part of a reciprocal square root, it may be folded
// into a target-specific square root estimate instruction.
if (N1.getOpcode() == ISD::FSQRT) {
- if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0), Flags)) {
+ if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0), Flags))
return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
- }
} else if (N1.getOpcode() == ISD::FP_EXTEND &&
N1.getOperand(0).getOpcode() == ISD::FSQRT) {
if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0),
@@ -12645,28 +12569,16 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
}
// Fold into a reciprocal estimate and multiply instead of a real divide.
- if (SDValue RV = BuildReciprocalEstimate(N1, Flags)) {
- AddToWorklist(RV.getNode());
- return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
- }
+ if (SDValue RV = BuildDivEstimate(N0, N1, Flags))
+ return RV;
}
// (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y)
- if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options,
- ForCodeSize)) {
- if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options,
- ForCodeSize)) {
- // Both can be negated for free, check to see if at least one is cheaper
- // negated.
- if (LHSNeg == 2 || RHSNeg == 2)
- return DAG.getNode(ISD::FDIV, SDLoc(N), VT,
- GetNegatedExpression(N0, DAG, LegalOperations,
- ForCodeSize),
- GetNegatedExpression(N1, DAG, LegalOperations,
- ForCodeSize),
- Flags);
- }
- }
+ if (isCheaperToUseNegatedFPOps(N0, N1))
+ return DAG.getNode(
+ ISD::FDIV, SDLoc(N), VT,
+ TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize),
+ TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize), Flags);
return SDValue();
}
@@ -13112,22 +13024,6 @@ SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
return SDValue();
}
-SDValue DAGCombiner::visitFP_ROUND_INREG(SDNode *N) {
- SDValue N0 = N->getOperand(0);
- EVT VT = N->getValueType(0);
- EVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
- ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
-
- // fold (fp_round_inreg c1fp) -> c1fp
- if (N0CFP && isTypeLegal(EVT)) {
- SDLoc DL(N);
- SDValue Round = DAG.getConstantFP(*N0CFP->getConstantFPValue(), DL, EVT);
- return DAG.getNode(ISD::FP_EXTEND, DL, VT, Round);
- }
-
- return SDValue();
-}
-
SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
@@ -13236,9 +13132,8 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
if (isConstantFPBuildVectorOrConstantFP(N0))
return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0);
- if (isNegatibleForFree(N0, LegalOperations, DAG.getTargetLoweringInfo(),
- &DAG.getTarget().Options, ForCodeSize))
- return GetNegatedExpression(N0, DAG, LegalOperations, ForCodeSize);
+ if (TLI.isNegatibleForFree(N0, DAG, LegalOperations, ForCodeSize))
+ return TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize);
// Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading
// constant pool values.
@@ -14004,11 +13899,12 @@ bool DAGCombiner::extendLoadedValueToExtension(LoadSDNode *LD, SDValue &Val) {
}
SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
- if (OptLevel == CodeGenOpt::None || LD->isVolatile())
+ if (OptLevel == CodeGenOpt::None || !LD->isSimple())
return SDValue();
SDValue Chain = LD->getOperand(0);
StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain.getNode());
- if (!ST || ST->isVolatile())
+ // TODO: Relax this restriction for unordered atomics (see D66309)
+ if (!ST || !ST->isSimple())
return SDValue();
EVT LDType = LD->getValueType(0);
@@ -14107,7 +14003,8 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
// If load is not volatile and there are no uses of the loaded value (and
// the updated indexed value in case of indexed loads), change uses of the
// chain value into uses of the chain input (i.e. delete the dead load).
- if (!LD->isVolatile()) {
+ // TODO: Allow this for unordered atomics (see D66309)
+ if (LD->isSimple()) {
if (N->getValueType(1) == MVT::Other) {
// Unindexed loads.
if (!N->hasAnyUseOfValue(0)) {
@@ -14241,7 +14138,7 @@ struct LoadedSlice {
/// Helper structure used to compute the cost of a slice.
struct Cost {
/// Are we optimizing for code size.
- bool ForCodeSize;
+ bool ForCodeSize = false;
/// Various costs.
unsigned Loads = 0;
@@ -14250,10 +14147,10 @@ struct LoadedSlice {
unsigned ZExts = 0;
unsigned Shift = 0;
- Cost(bool ForCodeSize = false) : ForCodeSize(ForCodeSize) {}
+ explicit Cost(bool ForCodeSize) : ForCodeSize(ForCodeSize) {}
/// Get the cost of one isolated slice.
- Cost(const LoadedSlice &LS, bool ForCodeSize = false)
+ Cost(const LoadedSlice &LS, bool ForCodeSize)
: ForCodeSize(ForCodeSize), Loads(1) {
EVT TruncType = LS.Inst->getValueType(0);
EVT LoadedType = LS.getLoadedType();
@@ -14678,7 +14575,7 @@ bool DAGCombiner::SliceUpLoad(SDNode *N) {
return false;
LoadSDNode *LD = cast<LoadSDNode>(N);
- if (LD->isVolatile() || !ISD::isNormalLoad(LD) ||
+ if (!LD->isSimple() || !ISD::isNormalLoad(LD) ||
!LD->getValueType(0).isInteger())
return false;
@@ -14829,13 +14726,7 @@ CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) {
else if (Chain->getOpcode() == ISD::TokenFactor &&
SDValue(LD, 1).hasOneUse()) {
// LD has only 1 chain use, so there are no indirect dependencies.
- bool isOk = false;
- for (const SDValue &ChainOp : Chain->op_values())
- if (ChainOp.getNode() == LD) {
- isOk = true;
- break;
- }
- if (!isOk)
+ if (!LD->isOperandOf(Chain.getNode()))
return Result;
} else
return Result; // Fail.
@@ -14848,7 +14739,7 @@ CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) {
/// Check to see if IVal is something that provides a value as specified by
/// MaskInfo. If so, replace the specified store with a narrower store of
/// truncated IVal.
-static SDNode *
+static SDValue
ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
SDValue IVal, StoreSDNode *St,
DAGCombiner *DC) {
@@ -14860,14 +14751,19 @@ ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
// that uses this. If not, this is not a replacement.
APInt Mask = ~APInt::getBitsSet(IVal.getValueSizeInBits(),
ByteShift*8, (ByteShift+NumBytes)*8);
- if (!DAG.MaskedValueIsZero(IVal, Mask)) return nullptr;
+ if (!DAG.MaskedValueIsZero(IVal, Mask)) return SDValue();
// Check that it is legal on the target to do this. It is legal if the new
// VT we're shrinking to (i8/i16/i32) is legal or we're still before type
- // legalization.
- MVT VT = MVT::getIntegerVT(NumBytes*8);
+ // legalization (and the target doesn't explicitly think this is a bad idea).
+ MVT VT = MVT::getIntegerVT(NumBytes * 8);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!DC->isTypeLegal(VT))
- return nullptr;
+ return SDValue();
+ if (St->getMemOperand() &&
+ !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+ *St->getMemOperand()))
+ return SDValue();
// Okay, we can do this! Replace the 'St' store with a store of IVal that is
// shifted by ByteShift and truncated down to NumBytes.
@@ -14901,8 +14797,7 @@ ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
++OpsNarrowed;
return DAG
.getStore(St->getChain(), SDLoc(St), IVal, Ptr,
- St->getPointerInfo().getWithOffset(StOffset), NewAlign)
- .getNode();
+ St->getPointerInfo().getWithOffset(StOffset), NewAlign);
}
/// Look for sequence of load / op / store where op is one of 'or', 'xor', and
@@ -14911,7 +14806,7 @@ ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
/// or code size.
SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
StoreSDNode *ST = cast<StoreSDNode>(N);
- if (ST->isVolatile())
+ if (!ST->isSimple())
return SDValue();
SDValue Chain = ST->getChain();
@@ -14933,16 +14828,16 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
std::pair<unsigned, unsigned> MaskedLoad;
MaskedLoad = CheckForMaskedLoad(Value.getOperand(0), Ptr, Chain);
if (MaskedLoad.first)
- if (SDNode *NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
+ if (SDValue NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
Value.getOperand(1), ST,this))
- return SDValue(NewST, 0);
+ return NewST;
// Or is commutative, so try swapping X and Y.
MaskedLoad = CheckForMaskedLoad(Value.getOperand(1), Ptr, Chain);
if (MaskedLoad.first)
- if (SDNode *NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
+ if (SDValue NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
Value.getOperand(0), ST,this))
- return SDValue(NewST, 0);
+ return NewST;
}
if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) ||
@@ -15367,14 +15262,16 @@ void DAGCombiner::getStoreMergeCandidates(
// Loads must only have one use.
if (!Ld->hasNUsesOfValue(1, 0))
return;
- // The memory operands must not be volatile/indexed.
- if (Ld->isVolatile() || Ld->isIndexed())
+ // The memory operands must not be volatile/indexed/atomic.
+ // TODO: May be able to relax for unordered atomics (see D66309)
+ if (!Ld->isSimple() || Ld->isIndexed())
return;
}
auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr,
int64_t &Offset) -> bool {
- // The memory operands must not be volatile/indexed.
- if (Other->isVolatile() || Other->isIndexed())
+ // The memory operands must not be volatile/indexed/atomic.
+ // TODO: May be able to relax for unordered atomics (see D66309)
+ if (!Other->isSimple() || Other->isIndexed())
return false;
// Don't mix temporal stores with non-temporal stores.
if (St->isNonTemporal() != Other->isNonTemporal())
@@ -15394,8 +15291,10 @@ void DAGCombiner::getStoreMergeCandidates(
// Loads must only have one use.
if (!OtherLd->hasNUsesOfValue(1, 0))
return false;
- // The memory operands must not be volatile/indexed.
- if (OtherLd->isVolatile() || OtherLd->isIndexed())
+ // The memory operands must not be volatile/indexed/atomic.
+ // TODO: May be able to relax for unordered atomics (see D66309)
+ if (!OtherLd->isSimple() || OtherLd->isIndexed())
return false;
// Don't mix temporal loads with non-temporal loads.
if (cast<LoadSDNode>(Val)->isNonTemporal() != OtherLd->isNonTemporal())
@@ -15425,6 +15324,18 @@ void DAGCombiner::getStoreMergeCandidates(
return (BasePtr.equalBaseIndex(Ptr, DAG, Offset));
};
+ // Check whether this StoreNode/RootNode pair has already bailed out of the
+ // dependence check more times than the limit allows.
+ auto OverLimitInDependenceCheck = [&](SDNode *StoreNode,
+ SDNode *RootNode) -> bool {
+ auto RootCount = StoreRootCountMap.find(StoreNode);
+ if (RootCount != StoreRootCountMap.end() &&
+ RootCount->second.first == RootNode &&
+ RootCount->second.second > StoreMergeDependenceLimit)
+ return true;
+ return false;
+ };
+
// We are looking for a root node which is an ancestor to all mergeable
// stores. We search up through a load, to our root and then down
// through all children. For instance we will find Store{1,2,3} if
@@ -15454,7 +15365,8 @@ void DAGCombiner::getStoreMergeCandidates(
if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I2)) {
BaseIndexOffset Ptr;
int64_t PtrDiff;
- if (CandidateMatch(OtherST, Ptr, PtrDiff))
+ if (CandidateMatch(OtherST, Ptr, PtrDiff) &&
+ !OverLimitInDependenceCheck(OtherST, RootNode))
StoreNodes.push_back(MemOpLink(OtherST, PtrDiff));
}
} else
@@ -15464,7 +15376,8 @@ void DAGCombiner::getStoreMergeCandidates(
if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) {
BaseIndexOffset Ptr;
int64_t PtrDiff;
- if (CandidateMatch(OtherST, Ptr, PtrDiff))
+ if (CandidateMatch(OtherST, Ptr, PtrDiff) &&
+ !OverLimitInDependenceCheck(OtherST, RootNode))
StoreNodes.push_back(MemOpLink(OtherST, PtrDiff));
}
}
@@ -15522,13 +15435,24 @@ bool DAGCombiner::checkMergeStoreCandidatesForDependencies(
// Search through DAG. We can stop early if we find a store node.
for (unsigned i = 0; i < NumStores; ++i)
if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist,
- Max))
+ Max)) {
+ // If the search bails out, record the StoreNode and RootNode in the
+ // StoreRootCountMap. Once the pair has been seen more times than the
+ // limit, the StoreNode is no longer added to the StoreNodes set.
+ if (Visited.size() >= Max) {
+ auto &RootCount = StoreRootCountMap[StoreNodes[i].MemNode];
+ if (RootCount.first == RootNode)
+ RootCount.second++;
+ else
+ RootCount = {RootNode, 1};
+ }
return false;
+ }
return true;
}
bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
- if (OptLevel == CodeGenOpt::None)
+ if (OptLevel == CodeGenOpt::None || !EnableStoreMerging)
return false;
EVT MemVT = St->getMemoryVT();
@@ -15588,7 +15512,7 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
bool RV = false;
while (StoreNodes.size() > 1) {
- unsigned StartIdx = 0;
+ size_t StartIdx = 0;
while ((StartIdx + 1 < StoreNodes.size()) &&
StoreNodes[StartIdx].OffsetFromBase + ElementSizeBytes !=
StoreNodes[StartIdx + 1].OffsetFromBase)
@@ -16113,7 +16037,7 @@ SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) {
case MVT::ppcf128:
return SDValue();
case MVT::f32:
- if ((isTypeLegal(MVT::i32) && !LegalOperations && !ST->isVolatile()) ||
+ if ((isTypeLegal(MVT::i32) && !LegalOperations && ST->isSimple()) ||
TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
;
Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF().
@@ -16125,7 +16049,7 @@ SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) {
return SDValue();
case MVT::f64:
if ((TLI.isTypeLegal(MVT::i64) && !LegalOperations &&
- !ST->isVolatile()) ||
+ ST->isSimple()) ||
TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) {
;
Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
@@ -16134,7 +16058,7 @@ SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) {
Ptr, ST->getMemOperand());
}
- if (!ST->isVolatile() &&
+ if (ST->isSimple() &&
TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
// Many FP stores are not made apparent until after legalize, e.g. for
// argument passing. Since this is so common, custom legalize the
@@ -16181,7 +16105,8 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
// memory accesses. We don't care if the original type was legal or not
// as we assume software couldn't rely on the number of accesses of an
// illegal type.
- if (((!LegalOperations && !ST->isVolatile()) ||
+ // TODO: May be able to relax for unordered atomics (see D66309)
+ if (((!LegalOperations && ST->isSimple()) ||
TLI.isOperationLegal(ISD::STORE, SVT)) &&
TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT,
DAG, *ST->getMemOperand())) {
@@ -16242,9 +16167,8 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
// See if we can simplify the input to this truncstore with knowledge that
// only the low bits are being used. For example:
// "truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8"
- SDValue Shorter = DAG.GetDemandedBits(Value, TruncDemandedBits);
AddToWorklist(Value.getNode());
- if (Shorter)
+ if (SDValue Shorter = DAG.GetDemandedBits(Value, TruncDemandedBits))
return DAG.getTruncStore(Chain, SDLoc(N), Shorter, Ptr, ST->getMemoryVT(),
ST->getMemOperand());
@@ -16263,9 +16187,10 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
// If this is a load followed by a store to the same location, then the store
// is dead/noop.
+ // TODO: Can relax for unordered atomics (see D66309)
if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Value)) {
if (Ld->getBasePtr() == Ptr && ST->getMemoryVT() == Ld->getMemoryVT() &&
- ST->isUnindexed() && !ST->isVolatile() &&
+ ST->isUnindexed() && ST->isSimple() &&
// There can't be any side effects between the load and store, such as
// a call or store.
Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1))) {
@@ -16274,9 +16199,10 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
}
}
+ // TODO: Can relax for unordered atomics (see D66309)
if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
- if (ST->isUnindexed() && !ST->isVolatile() && ST1->isUnindexed() &&
- !ST1->isVolatile()) {
+ if (ST->isUnindexed() && ST->isSimple() &&
+ ST1->isUnindexed() && ST1->isSimple()) {
if (ST1->getBasePtr() == Ptr && ST1->getValue() == Value &&
ST->getMemoryVT() == ST1->getMemoryVT()) {
// If this is a store followed by a store with the same value to the
@@ -16405,7 +16331,8 @@ SDValue DAGCombiner::visitLIFETIME_END(SDNode *N) {
break;
case ISD::STORE: {
StoreSDNode *ST = dyn_cast<StoreSDNode>(Chain);
- if (ST->isVolatile() || ST->isIndexed())
+ // TODO: Can relax for unordered atomics (see D66309)
+ if (!ST->isSimple() || ST->isIndexed())
continue;
const BaseIndexOffset StoreBase = BaseIndexOffset::match(ST, DAG);
// If we store purely within object bounds just before its lifetime ends,
@@ -16456,6 +16383,11 @@ SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) {
if (OptLevel == CodeGenOpt::None)
return SDValue();
+ // Can't change the number of memory accesses for a volatile store or break
+ // atomicity for an atomic one.
+ if (!ST->isSimple())
+ return SDValue();
+
SDValue Val = ST->getValue();
SDLoc DL(ST);
@@ -16531,12 +16463,52 @@ SDValue DAGCombiner::splitMergedValStore(StoreSDNode *ST) {
}
/// Convert a disguised subvector insertion into a shuffle:
-/// insert_vector_elt V, (bitcast X from vector type), IdxC -->
-/// bitcast(shuffle (bitcast V), (extended X), Mask)
-/// Note: We do not use an insert_subvector node because that requires a legal
-/// subvector type.
SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
SDValue InsertVal = N->getOperand(1);
+ SDValue Vec = N->getOperand(0);
+
+ // (insert_vector_elt (vector_shuffle X, Y), (extract_vector_elt X, N), InsIndex)
+ // --> (vector_shuffle X, Y)
+ if (Vec.getOpcode() == ISD::VECTOR_SHUFFLE && Vec.hasOneUse() &&
+ InsertVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ isa<ConstantSDNode>(InsertVal.getOperand(1))) {
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Vec.getNode());
+ ArrayRef<int> Mask = SVN->getMask();
+
+ SDValue X = Vec.getOperand(0);
+ SDValue Y = Vec.getOperand(1);
+
+ // Vec's operand 0 is using indices from 0 to N-1 and
+ // operand 1 from N to 2N - 1, where N is the number of
+ // elements in the vectors.
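+ // e.g. with 4-element vectors, inserting element 2 extracted from Y
+ // rewrites the mask entry at InsIndex to 4 + 2 = 6.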
+ int XOffset = -1;
+ if (InsertVal.getOperand(0) == X) {
+ XOffset = 0;
+ } else if (InsertVal.getOperand(0) == Y) {
+ XOffset = X.getValueType().getVectorNumElements();
+ }
+
+ if (XOffset != -1) {
+ SmallVector<int, 16> NewMask(Mask.begin(), Mask.end());
+
+ auto *ExtrIndex = cast<ConstantSDNode>(InsertVal.getOperand(1));
+ NewMask[InsIndex] = XOffset + ExtrIndex->getZExtValue();
+ assert(NewMask[InsIndex] <
+ (int)(2 * Vec.getValueType().getVectorNumElements()) &&
+ NewMask[InsIndex] >= 0 && "NewMask[InsIndex] is out of bounds");
+
+ SDValue LegalShuffle =
+ TLI.buildLegalVectorShuffle(Vec.getValueType(), SDLoc(N), X,
+ Y, NewMask, DAG);
+ if (LegalShuffle)
+ return LegalShuffle;
+ }
+ }
+
+ // insert_vector_elt V, (bitcast X from vector type), IdxC -->
+ // bitcast(shuffle (bitcast V), (extended X), Mask)
+ // Note: We do not use an insert_subvector node because that requires a
+ // legal subvector type.
if (InsertVal.getOpcode() != ISD::BITCAST || !InsertVal.hasOneUse() ||
!InsertVal.getOperand(0).getValueType().isVector())
return SDValue();
@@ -16674,7 +16646,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
SDValue EltNo,
LoadSDNode *OriginalLoad) {
- assert(!OriginalLoad->isVolatile());
+ assert(OriginalLoad->isSimple());
EVT ResultVT = EVE->getValueType(0);
EVT VecEltVT = InVecVT.getVectorElementType();
@@ -16747,12 +16719,12 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
SDValue From[] = { SDValue(EVE, 0), SDValue(OriginalLoad, 1) };
SDValue To[] = { Load, Chain };
DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
+ // Make sure to revisit this node to clean it up; it will usually be dead.
+ AddToWorklist(EVE);
// Since we're explicitly calling ReplaceAllUses, add the new node to the
// worklist explicitly as well.
- AddToWorklist(Load.getNode());
AddUsersToWorklist(Load.getNode()); // Add users too
- // Make sure to revisit this node to clean it up; it will usually be dead.
- AddToWorklist(EVE);
+ AddToWorklist(Load.getNode());
++OpsNarrowed;
return SDValue(EVE, 0);
}
@@ -16982,7 +16954,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
ISD::isNormalLoad(VecOp.getNode()) &&
!Index->hasPredecessor(VecOp.getNode())) {
auto *VecLoad = dyn_cast<LoadSDNode>(VecOp);
- if (VecLoad && !VecLoad->isVolatile())
+ if (VecLoad && VecLoad->isSimple())
return scalarizeExtractedVectorLoad(N, VecVT, Index, VecLoad);
}
@@ -17041,7 +17013,7 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
// Make sure we found a simple (not volatile or atomic) load and that the
// extractelement is the only use.
- if (!LN0 || !LN0->hasNUsesOfValue(1,0) || LN0->isVolatile())
+ if (!LN0 || !LN0->hasNUsesOfValue(1,0) || !LN0->isSimple())
return SDValue();
// If Idx was -1 above, Elt is going to be -1, so just return undef.
@@ -17344,17 +17316,16 @@ static SDValue reduceBuildVecToShuffleWithZero(SDNode *BV, SelectionDAG &DAG) {
// the shuffle mask with -1.
}
- // Turn this into a shuffle with zero if that's legal.
- EVT VecVT = Extract.getOperand(0).getValueType();
- if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(ShufMask, VecVT))
- return SDValue();
-
// buildvec undef, ..., (zext (extractelt V, IndexC)), undef... -->
// bitcast (shuffle V, ZeroVec, VectorMask)
SDLoc DL(BV);
+ EVT VecVT = Extract.getOperand(0).getValueType();
SDValue ZeroVec = DAG.getConstant(0, DL, VecVT);
- SDValue Shuf = DAG.getVectorShuffle(VecVT, DL, Extract.getOperand(0), ZeroVec,
- ShufMask);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Shuf = TLI.buildLegalVectorShuffle(VecVT, DL, Extract.getOperand(0),
+ ZeroVec, ShufMask, DAG);
+ if (!Shuf)
+ return SDValue();
return DAG.getBitcast(VT, Shuf);
}
@@ -17656,6 +17627,13 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
}
}
+ // A splat of a single element is a SPLAT_VECTOR if supported on the target.
+ if (TLI.getOperationAction(ISD::SPLAT_VECTOR, VT) != TargetLowering::Expand)
+ if (SDValue V = cast<BuildVectorSDNode>(N)->getSplatValue()) {
+ assert(!V.isUndef() && "Splat of undef should have been handled earlier");
+ return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), VT, V);
+ }
+
// Check if we can express BUILD VECTOR via subvector extract.
if (!LegalTypes && (N->getNumOperands() > 1)) {
SDValue Op0 = N->getOperand(0);
@@ -17829,11 +17807,9 @@ static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {
}
}
- if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(Mask, VT))
- return SDValue();
-
- return DAG.getVectorShuffle(VT, SDLoc(N), DAG.getBitcast(VT, SV0),
- DAG.getBitcast(VT, SV1), Mask);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ return TLI.buildLegalVectorShuffle(VT, SDLoc(N), DAG.getBitcast(VT, SV0),
+ DAG.getBitcast(VT, SV1), Mask, DAG);
}
SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
@@ -17853,6 +17829,15 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
SDValue In = N->getOperand(0);
assert(In.getValueType().isVector() && "Must concat vectors");
+ // If the input is a concat_vectors, just make a larger concat by padding
+ // with smaller undefs.
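+ // e.g. concat (concat X, Y), undef --> concat X, Y, undef, undef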
+ if (In.getOpcode() == ISD::CONCAT_VECTORS && In.hasOneUse()) {
+ unsigned NumOps = N->getNumOperands() * In.getNumOperands();
+ SmallVector<SDValue, 4> Ops(In->op_begin(), In->op_end());
+ Ops.resize(NumOps, DAG.getUNDEF(Ops[0].getValueType()));
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
+ }
+
SDValue Scalar = peekThroughOneUseBitcasts(In);
// concat_vectors(scalar_to_vector(scalar), undef) ->
@@ -18002,6 +17987,23 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
return SDValue();
}
+// Helper that peeks through INSERT_SUBVECTOR/CONCAT_VECTORS to find
+// if the subvector can be sourced for free.
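+// e.g. for V = (insert_subvector W, X, Index), operand X is returned when
+// its type and insertion index match the requested subvector extraction.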
+static SDValue getSubVectorSrc(SDValue V, SDValue Index, EVT SubVT) {
+ if (V.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ V.getOperand(1).getValueType() == SubVT && V.getOperand(2) == Index) {
+ return V.getOperand(1);
+ }
+ auto *IndexC = dyn_cast<ConstantSDNode>(Index);
+ if (IndexC && V.getOpcode() == ISD::CONCAT_VECTORS &&
+ V.getOperand(0).getValueType() == SubVT &&
+ (IndexC->getZExtValue() % SubVT.getVectorNumElements()) == 0) {
+ uint64_t SubIdx = IndexC->getZExtValue() / SubVT.getVectorNumElements();
+ return V.getOperand(SubIdx);
+ }
+ return SDValue();
+}
+
static SDValue narrowInsertExtractVectorBinOp(SDNode *Extract,
SelectionDAG &DAG) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -18010,39 +18012,29 @@ static SDValue narrowInsertExtractVectorBinOp(SDNode *Extract,
if (!TLI.isBinOp(BinOpcode) || BinOp.getNode()->getNumValues() != 1)
return SDValue();
+ EVT VecVT = BinOp.getValueType();
SDValue Bop0 = BinOp.getOperand(0), Bop1 = BinOp.getOperand(1);
- SDValue Index = Extract->getOperand(1);
- EVT VT = Extract->getValueType(0);
+ if (VecVT != Bop0.getValueType() || VecVT != Bop1.getValueType())
+ return SDValue();
- // Helper that peeks through INSERT_SUBVECTOR/CONCAT_VECTORS to find
- // if the source subvector is the same type as the one being extracted.
- auto GetSubVector = [VT, Index](SDValue V) -> SDValue {
- if (V.getOpcode() == ISD::INSERT_SUBVECTOR &&
- V.getOperand(1).getValueType() == VT && V.getOperand(2) == Index) {
- return V.getOperand(1);
- }
- auto *IndexC = dyn_cast<ConstantSDNode>(Index);
- if (IndexC && V.getOpcode() == ISD::CONCAT_VECTORS &&
- V.getOperand(0).getValueType() == VT &&
- (IndexC->getZExtValue() % VT.getVectorNumElements()) == 0) {
- uint64_t SubIdx = IndexC->getZExtValue() / VT.getVectorNumElements();
- return V.getOperand(SubIdx);
- }
+ SDValue Index = Extract->getOperand(1);
+ EVT SubVT = Extract->getValueType(0);
+ if (!TLI.isOperationLegalOrCustom(BinOpcode, SubVT))
return SDValue();
- };
- SDValue Sub0 = GetSubVector(Bop0);
- SDValue Sub1 = GetSubVector(Bop1);
+
+ SDValue Sub0 = getSubVectorSrc(Bop0, Index, SubVT);
+ SDValue Sub1 = getSubVectorSrc(Bop1, Index, SubVT);
// TODO: We could handle the case where only 1 operand is being inserted by
// creating an extract of the other operand, but that requires checking
// number of uses and/or costs.
- if (!Sub0 || !Sub1 || !TLI.isOperationLegalOrCustom(BinOpcode, VT))
+ if (!Sub0 || !Sub1)
return SDValue();
// We are inserting both operands of the wide binop only to extract back
// to the narrow vector size. Eliminate all of the insert/extract:
// ext (binop (ins ?, X, Index), (ins ?, Y, Index)), Index --> binop X, Y
- return DAG.getNode(BinOpcode, SDLoc(Extract), VT, Sub0, Sub1,
+ return DAG.getNode(BinOpcode, SDLoc(Extract), SubVT, Sub0, Sub1,
BinOp->getFlags());
}
@@ -18174,7 +18166,8 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
auto *Ld = dyn_cast<LoadSDNode>(Extract->getOperand(0));
auto *ExtIdx = dyn_cast<ConstantSDNode>(Extract->getOperand(1));
- if (!Ld || Ld->getExtensionType() || Ld->isVolatile() || !ExtIdx)
+ if (!Ld || Ld->getExtensionType() || !Ld->isSimple() || !ExtIdx)
return SDValue();
// Allow targets to opt-out.
@@ -18878,7 +18871,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
// build_vector.
if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
int SplatIndex = SVN->getSplatIndex();
- if (TLI.isExtractVecEltCheap(VT, SplatIndex) &&
+ if (N0.hasOneUse() && TLI.isExtractVecEltCheap(VT, SplatIndex) &&
TLI.isBinOp(N0.getOpcode()) && N0.getNode()->getNumValues() == 1) {
// splat (vector_bo L, R), Index -->
// splat (scalar_bo (extelt L, Index), (extelt R, Index))
@@ -19153,22 +19146,13 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
SV1 = DAG.getUNDEF(VT);
// Avoid introducing shuffles with illegal mask.
- if (!TLI.isShuffleMaskLegal(Mask, VT)) {
- ShuffleVectorSDNode::commuteMask(Mask);
-
- if (!TLI.isShuffleMaskLegal(Mask, VT))
- return SDValue();
-
- // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, A, M2)
- // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, A, M2)
- // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, B, M2)
- std::swap(SV0, SV1);
- }
-
// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
// shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
- return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, Mask);
+ // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, A, M2)
+ // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, A, M2)
+ // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, B, M2)
+ return TLI.buildLegalVectorShuffle(VT, SDLoc(N), SV0, SV1, Mask, DAG);
}
if (SDValue V = foldShuffleOfConcatUndefs(SVN, DAG))
@@ -19191,35 +19175,35 @@ SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) {
SmallVector<int, 8> NewMask(InVecT.getVectorNumElements(), -1);
int Elt = C0->getZExtValue();
NewMask[0] = Elt;
- SDValue Val;
// If we have an implicit truncate, do the truncate here as long as it's
// legal; if it's not legal, no fold is performed.
if (VT.getScalarType() != InVal.getValueType() &&
InVal.getValueType().isScalarInteger() &&
isTypeLegal(VT.getScalarType())) {
- Val =
+ SDValue Val =
DAG.getNode(ISD::TRUNCATE, SDLoc(InVal), VT.getScalarType(), InVal);
return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Val);
}
if (VT.getScalarType() == InVecT.getScalarType() &&
- VT.getVectorNumElements() <= InVecT.getVectorNumElements() &&
- TLI.isShuffleMaskLegal(NewMask, VT)) {
- Val = DAG.getVectorShuffle(InVecT, SDLoc(N), InVec,
- DAG.getUNDEF(InVecT), NewMask);
- // If the initial vector is the correct size this shuffle is a
- // valid result.
- if (VT == InVecT)
- return Val;
- // If not we must truncate the vector.
- if (VT.getVectorNumElements() != InVecT.getVectorNumElements()) {
- MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
- SDValue ZeroIdx = DAG.getConstant(0, SDLoc(N), IdxTy);
- EVT SubVT =
- EVT::getVectorVT(*DAG.getContext(), InVecT.getVectorElementType(),
- VT.getVectorNumElements());
- Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), SubVT, Val,
- ZeroIdx);
- return Val;
+ VT.getVectorNumElements() <= InVecT.getVectorNumElements()) {
+ SDValue LegalShuffle =
+ TLI.buildLegalVectorShuffle(InVecT, SDLoc(N), InVec,
+ DAG.getUNDEF(InVecT), NewMask, DAG);
+ if (LegalShuffle) {
+ // If the initial vector is the correct size this shuffle is a
+ // valid result.
+ if (VT == InVecT)
+ return LegalShuffle;
+ // If not we must truncate the vector.
+ if (VT.getVectorNumElements() != InVecT.getVectorNumElements()) {
+ MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+ SDValue ZeroIdx = DAG.getConstant(0, SDLoc(N), IdxTy);
+ EVT SubVT =
+ EVT::getVectorVT(*DAG.getContext(), InVecT.getVectorElementType(),
+ VT.getVectorNumElements());
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), SubVT,
+ LegalShuffle, ZeroIdx);
+ }
}
}
}
@@ -19627,6 +19611,39 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
}
}
+ // Make sure all but the first op are undef or constant.
+ auto ConcatWithConstantOrUndef = [](SDValue Concat) {
+ return Concat.getOpcode() == ISD::CONCAT_VECTORS &&
+ std::all_of(std::next(Concat->op_begin()), Concat->op_end(),
+ [](const SDValue &Op) {
+ return Op.isUndef() ||
+ ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
+ });
+ };
+
+ // The following pattern is likely to emerge with vector reduction ops. Moving
+ // the binary operation ahead of the concat may allow using a narrower vector
+ // instruction that has better performance than the wide version of the op:
+ // VBinOp (concat X, undef/constant), (concat Y, undef/constant) -->
+ // concat (VBinOp X, Y), VecC
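+ // e.g. add (concat X, C1), (concat Y, C2) --> concat (add X, Y), C1+C2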
+ if (ConcatWithConstantOrUndef(LHS) && ConcatWithConstantOrUndef(RHS) &&
+ (LHS.hasOneUse() || RHS.hasOneUse())) {
+ EVT NarrowVT = LHS.getOperand(0).getValueType();
+ if (NarrowVT == RHS.getOperand(0).getValueType() &&
+ TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT)) {
+ SDLoc DL(N);
+ unsigned NumOperands = LHS.getNumOperands();
+ SmallVector<SDValue, 4> ConcatOps;
+ for (unsigned i = 0; i != NumOperands; ++i) {
+ // Operands 1 and up are undef or constant, so this constant-folds for them.
+ ConcatOps.push_back(DAG.getNode(Opcode, DL, NarrowVT, LHS.getOperand(i),
+ RHS.getOperand(i)));
+ }
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
+ }
+ }
+
if (SDValue V = scalarizeBinOpOfSplats(N, DAG))
return V;
@@ -19723,7 +19740,9 @@ bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
// Token chains must be identical.
if (LHS.getOperand(0) != RHS.getOperand(0) ||
// Do not let this transformation reduce the number of volatile loads.
- LLD->isVolatile() || RLD->isVolatile() ||
+ // Be conservative for atomics for the moment.
+ // TODO: This does appear to be legal for unordered atomics (see D66309)
+ !LLD->isSimple() || !RLD->isSimple() ||
// FIXME: If either is a pre/post inc/dec load,
// we'd need to split out the address adjustment.
LLD->isIndexed() || RLD->isIndexed() ||
@@ -19928,7 +19947,7 @@ SDValue DAGCombiner::foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0,
SDValue DAGCombiner::convertSelectOfFPConstantsToLoadOffset(
const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2, SDValue N3,
ISD::CondCode CC) {
- if (!TLI.reduceSelectOfFPConstantLoads(N0.getValueType().isFloatingPoint()))
+ if (!TLI.reduceSelectOfFPConstantLoads(N0.getValueType()))
return SDValue();
// If we are before legalize types, we want the other legalization to happen
@@ -20016,8 +20035,13 @@ SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
// when the condition can be materialized as an all-ones register. Any
// single bit-test can be materialized as an all-ones register with
// shift-left and shift-right-arith.
+ // TODO: The operation legality checks could be loosened to include "custom",
+ // but that may cause regressions for targets that do not have shift
+ // instructions.
if (CC == ISD::SETEQ && N0->getOpcode() == ISD::AND &&
- N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2)) {
+ N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2) &&
+ TLI.isOperationLegal(ISD::SHL, VT) &&
+ TLI.isOperationLegal(ISD::SRA, VT)) {
SDValue AndLHS = N0->getOperand(0);
auto *ConstAndRHS = dyn_cast<ConstantSDNode>(N0->getOperand(1));
if (ConstAndRHS && ConstAndRHS->getAPIntValue().countPopulation() == 1) {
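[Editor's note] The fold guarded above turns a single-bit-test select into shl+sra+and, which is why SHL/SRA legality is now checked. A scalar sketch of the rewrite (assumes 32-bit two's-complement ints and an arithmetic right shift of negatives, as the DAG's SRA provides):

    #include <cassert>
    #include <cstdint>

    // select((x & (1 << k)) == 0, 0, y) rewritten branchlessly:
    // shift bit k into the sign position, arithmetic-shift it across the
    // word to get 0 or all-ones, then AND with y.
    int32_t bitTestSelect(int32_t x, unsigned k, int32_t y) {
      int32_t AllOnes = (int32_t)((uint32_t)x << (31 - k)) >> 31;
      return AllOnes & y;
    }

    int main() {
      for (unsigned k = 0; k != 32; ++k)
        for (int32_t x : {0, 1, 5, -7, 1 << 20})
          for (int32_t y : {0, 42, -3})
            assert(bitTestSelect(x, k, y) == (((x >> k) & 1) ? y : 0));
    }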
@@ -20209,7 +20233,10 @@ SDValue DAGCombiner::BuildLogBase2(SDValue V, const SDLoc &DL) {
/// =>
/// X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
/// does not require additional intermediate precision]
-SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags) {
+/// For the last iteration, fold the numerator N in to gain more precision:
+/// Result = N X_i + X_i (N - N A X_i)
+SDValue DAGCombiner::BuildDivEstimate(SDValue N, SDValue Op,
+ SDNodeFlags Flags) {
if (Level >= AfterLegalizeDAG)
return SDValue();
@@ -20230,25 +20257,39 @@ SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags Flags) {
if (SDValue Est = TLI.getRecipEstimate(Op, DAG, Enabled, Iterations)) {
AddToWorklist(Est.getNode());
+ SDLoc DL(Op);
if (Iterations) {
- SDLoc DL(Op);
SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
- // Newton iterations: Est = Est + Est (1 - Arg * Est)
+ // Newton iterations: Est = Est + Est (1 - Arg * Est)
+ // On the last iteration, fold in the numerator N as well:
+ //   Est = N * Est + Est * (N - Arg * (N * Est))
for (int i = 0; i < Iterations; ++i) {
- SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, Est, Flags);
+ SDValue MulEst = Est;
+
+ if (i == Iterations - 1) {
+ MulEst = DAG.getNode(ISD::FMUL, DL, VT, N, Est, Flags);
+ AddToWorklist(MulEst.getNode());
+ }
+
+ SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, MulEst, Flags);
AddToWorklist(NewEst.getNode());
- NewEst = DAG.getNode(ISD::FSUB, DL, VT, FPOne, NewEst, Flags);
+ NewEst = DAG.getNode(ISD::FSUB, DL, VT,
+ (i == Iterations - 1 ? N : FPOne), NewEst, Flags);
AddToWorklist(NewEst.getNode());
NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags);
AddToWorklist(NewEst.getNode());
- Est = DAG.getNode(ISD::FADD, DL, VT, Est, NewEst, Flags);
+ Est = DAG.getNode(ISD::FADD, DL, VT, MulEst, NewEst, Flags);
AddToWorklist(Est.getNode());
}
+ } else {
+ // If no iterations are available, multiply by the numerator N.
+ Est = DAG.getNode(ISD::FMUL, DL, VT, Est, N, Flags);
+ AddToWorklist(Est.getNode());
}
+
return Est;
}
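[Editor's note] The loop above mirrors the scalar recurrence X_{i+1} = X_i + X_i*(1 - A*X_i), with the numerator folded into the final step as N*X + X*(N - A*(N*X)). A stand-alone numeric sketch of exactly that schedule:

    #include <cassert>
    #include <cmath>

    // Scalar model of BuildDivEstimate's iteration order: A is the divisor,
    // Est the reciprocal estimate, and N the numerator folded into the last
    // iteration (or multiplied in directly when Iterations == 0).
    double divEstimate(double N, double A, double Est, int Iterations) {
      for (int i = 0; i < Iterations; ++i) {
        double MulEst = (i == Iterations - 1) ? N * Est : Est;
        double NewEst = A * MulEst;
        NewEst = (i == Iterations - 1 ? N : 1.0) - NewEst;
        NewEst = Est * NewEst;
        Est = MulEst + NewEst;
      }
      if (Iterations == 0)
        Est = Est * N;
      return Est;
    }

    int main() {
      // Three refinements of a crude 1/3 seed converge on 10/3.
      assert(std::fabs(divEstimate(10.0, 3.0, 0.3, 3) - 10.0 / 3.0) < 1e-6);
    }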
@@ -20271,31 +20312,19 @@ SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est,
// We now need 0.5 * Arg which we can write as (1.5 * Arg - Arg) so that
// this entire sequence requires only one FP constant.
SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, ThreeHalves, Arg, Flags);
- AddToWorklist(HalfArg.getNode());
-
HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Arg, Flags);
- AddToWorklist(HalfArg.getNode());
// Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
for (unsigned i = 0; i < Iterations; ++i) {
SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est, Flags);
- AddToWorklist(NewEst.getNode());
-
NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst, Flags);
- AddToWorklist(NewEst.getNode());
-
NewEst = DAG.getNode(ISD::FSUB, DL, VT, ThreeHalves, NewEst, Flags);
- AddToWorklist(NewEst.getNode());
-
Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags);
- AddToWorklist(Est.getNode());
}
// If non-reciprocal square root is requested, multiply the result by Arg.
- if (!Reciprocal) {
+ if (!Reciprocal)
Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg, Flags);
- AddToWorklist(Est.getNode());
- }
return Est;
}
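[Editor's note] The cleanup above keeps the one-constant Newton sequence intact: HalfArg = 1.5*A - A stands in for 0.5*A so that only the 1.5 constant is materialized. A scalar model of the iteration:

    #include <cassert>
    #include <cmath>

    // One-constant Newton refinement for 1/sqrt(A):
    //   HalfArg = 1.5 * A - A            // == 0.5 * A, one FP constant total
    //   Est     = Est * (1.5 - HalfArg * Est * Est)
    double rsqrtEstimate(double A, double Est, unsigned Iterations) {
      double HalfArg = 1.5 * A - A;
      for (unsigned i = 0; i != Iterations; ++i)
        Est = Est * (1.5 - HalfArg * Est * Est);
      return Est;
    }

    int main() {
      // Four refinements of a rough seed converge on 1/sqrt(4) == 0.5.
      assert(std::fabs(rsqrtEstimate(4.0, 0.4, 4) - 0.5) < 1e-8);
    }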
@@ -20321,13 +20350,8 @@ SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est,
// E = (E * -0.5) * ((A * E) * E + -3.0)
for (unsigned i = 0; i < Iterations; ++i) {
SDValue AE = DAG.getNode(ISD::FMUL, DL, VT, Arg, Est, Flags);
- AddToWorklist(AE.getNode());
-
SDValue AEE = DAG.getNode(ISD::FMUL, DL, VT, AE, Est, Flags);
- AddToWorklist(AEE.getNode());
-
SDValue RHS = DAG.getNode(ISD::FADD, DL, VT, AEE, MinusThree, Flags);
- AddToWorklist(RHS.getNode());
// When calculating a square root at the last iteration build:
// S = ((A * E) * -0.5) * ((A * E) * E + -3.0)
@@ -20340,10 +20364,8 @@ SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est,
// SQRT: LHS = (A * E) * -0.5
LHS = DAG.getNode(ISD::FMUL, DL, VT, AE, MinusHalf, Flags);
}
- AddToWorklist(LHS.getNode());
Est = DAG.getNode(ISD::FMUL, DL, VT, LHS, RHS, Flags);
- AddToWorklist(Est.getNode());
}
return Est;
@@ -20400,16 +20422,11 @@ SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
SDValue Fabs = DAG.getNode(ISD::FABS, DL, VT, Op);
SDValue IsDenorm = DAG.getSetCC(DL, CCVT, Fabs, NormC, ISD::SETLT);
Est = DAG.getNode(SelOpcode, DL, VT, IsDenorm, FPZero, Est);
- AddToWorklist(Fabs.getNode());
- AddToWorklist(IsDenorm.getNode());
- AddToWorklist(Est.getNode());
} else {
// X == 0.0 ? 0.0 : Est
SDValue FPZero = DAG.getConstantFP(0.0, DL, VT);
SDValue IsZero = DAG.getSetCC(DL, CCVT, Op, FPZero, ISD::SETEQ);
Est = DAG.getNode(SelOpcode, DL, VT, IsZero, FPZero, Est);
- AddToWorklist(IsZero.getNode());
- AddToWorklist(Est.getNode());
}
}
}
@@ -20432,6 +20449,7 @@ bool DAGCombiner::isAlias(SDNode *Op0, SDNode *Op1) const {
struct MemUseCharacteristics {
bool IsVolatile;
+ bool IsAtomic;
SDValue BasePtr;
int64_t Offset;
Optional<int64_t> NumBytes;
@@ -20447,18 +20465,20 @@ bool DAGCombiner::isAlias(SDNode *Op0, SDNode *Op1) const {
: (LSN->getAddressingMode() == ISD::PRE_DEC)
? -1 * C->getSExtValue()
: 0;
- return {LSN->isVolatile(), LSN->getBasePtr(), Offset /*base offset*/,
+ return {LSN->isVolatile(), LSN->isAtomic(), LSN->getBasePtr(),
+ Offset /*base offset*/,
Optional<int64_t>(LSN->getMemoryVT().getStoreSize()),
LSN->getMemOperand()};
}
if (const auto *LN = cast<LifetimeSDNode>(N))
- return {false /*isVolatile*/, LN->getOperand(1),
+ return {false /*isVolatile*/, /*isAtomic*/ false, LN->getOperand(1),
(LN->hasOffset()) ? LN->getOffset() : 0,
(LN->hasOffset()) ? Optional<int64_t>(LN->getSize())
: Optional<int64_t>(),
(MachineMemOperand *)nullptr};
// Default.
- return {false /*isvolatile*/, SDValue(), (int64_t)0 /*offset*/,
+ return {false /*isvolatile*/, /*isAtomic*/ false, SDValue(),
+ (int64_t)0 /*offset*/,
Optional<int64_t>() /*size*/, (MachineMemOperand *)nullptr};
};
@@ -20474,6 +20494,11 @@ bool DAGCombiner::isAlias(SDNode *Op0, SDNode *Op1) const {
if (MUC0.IsVolatile && MUC1.IsVolatile)
return true;
+ // Be conservative about atomics for the moment
+ // TODO: This is way overconservative for unordered atomics (see D66309)
+ if (MUC0.IsAtomic && MUC1.IsAtomic)
+ return true;
+
if (MUC0.MMO && MUC1.MMO) {
if ((MUC0.MMO->isInvariant() && MUC1.MMO->isStore()) ||
(MUC1.MMO->isInvariant() && MUC0.MMO->isStore()))
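[Editor's note] The new IsAtomic field and the pairwise check above encode a deliberately blunt rule for now. A compact model of the added logic (fields mirror the patch's MemUseCharacteristics, trimmed to the two flags):

    #include <cassert>

    struct MemUseFlags {
      bool IsVolatile = false;
      bool IsAtomic = false;
    };

    // Conservative early-outs from isAlias: two volatile accesses, or two
    // atomic accesses, are assumed to alias (the TODO notes this could be
    // relaxed for unordered atomics).
    bool mustAssumeAlias(const MemUseFlags &A, const MemUseFlags &B) {
      return (A.IsVolatile && B.IsVolatile) || (A.IsAtomic && B.IsAtomic);
    }

    int main() {
      assert(mustAssumeAlias({false, true}, {false, true}));   // atomic vs atomic
      assert(!mustAssumeAlias({false, true}, {false, false})); // atomic vs plain
    }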
@@ -20555,7 +20580,8 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
SmallPtrSet<SDNode *, 16> Visited; // Visited node set.
// Get alias information for node.
- const bool IsLoad = isa<LoadSDNode>(N) && !cast<LoadSDNode>(N)->isVolatile();
+ // TODO: Relax aliasing for unordered atomics (see D66309)
+ const bool IsLoad = isa<LoadSDNode>(N) && cast<LoadSDNode>(N)->isSimple();
// Starting off.
Chains.push_back(OriginalChain);
@@ -20571,8 +20597,9 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
case ISD::LOAD:
case ISD::STORE: {
// Get alias information for C.
+ // TODO: Relax aliasing for unordered atomics (see D66309)
bool IsOpLoad = isa<LoadSDNode>(C.getNode()) &&
- !cast<LSBaseSDNode>(C.getNode())->isVolatile();
+ cast<LSBaseSDNode>(C.getNode())->isSimple();
if ((IsLoad && IsOpLoad) || !isAlias(N, C.getNode())) {
// Look further up the chain.
C = C.getOperand(0);
@@ -20727,7 +20754,8 @@ bool DAGCombiner::parallelizeChainedStores(StoreSDNode *St) {
// If the chain has more than one use, then we can't reorder the mem ops.
if (!SDValue(Chain, 0)->hasOneUse())
break;
- if (Chain->isVolatile() || Chain->isIndexed())
+ // TODO: Relax for unordered atomics (see D66309)
+ if (!Chain->isSimple() || Chain->isIndexed())
break;
// Find the base pointer and offset for this memory node.
@@ -20795,11 +20823,11 @@ bool DAGCombiner::parallelizeChainedStores(StoreSDNode *St) {
SDValue TF = DAG.getTokenFactor(SDLoc(STChain), TFOps);
CombineTo(St, TF);
- AddToWorklist(STChain);
- // Add TF operands worklist in reverse order.
- for (auto I = TF->getNumOperands(); I;)
- AddToWorklist(TF->getOperand(--I).getNode());
+ // Add TF and its operands to the worklist.
AddToWorklist(TF.getNode());
+ for (const SDValue &Op : TF->ops())
+ AddToWorklist(Op.getNode());
+ AddToWorklist(STChain);
return true;
}
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index 22c23ba877e8..6d7260d7aee5 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -174,7 +174,7 @@ static unsigned findSinkableLocalRegDef(MachineInstr &MI) {
if (RegDef)
return 0;
RegDef = MO.getReg();
- } else if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+ } else if (Register::isVirtualRegister(MO.getReg())) {
// This is another use of a vreg. Don't try to sink it.
return 0;
}
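[Editor's note] This is the first of many mechanical TargetRegisterInfo:: -> Register:: replacements in this patch. The predicate itself just tests the tag bit separating the two register namespaces; a sketch of the encoding (mirrors llvm::Register's scheme of keeping virtual registers in the upper half of the 32-bit index space, simplified here):

    #include <cassert>
    #include <cstdint>

    // Virtual registers carry the top bit of the 32-bit index; physical
    // registers (and 0, meaning "no register") live in the lower half.
    bool isVirtualRegister(uint32_t Reg) { return (Reg & 0x80000000u) != 0; }
    bool isPhysicalRegister(uint32_t Reg) {
      return Reg != 0 && !isVirtualRegister(Reg);
    }

    int main() {
      assert(!isVirtualRegister(0));          // no register
      assert(isPhysicalRegister(42));         // a physreg index
      assert(isVirtualRegister(0x80000000u)); // the first virtual register
    }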
@@ -1213,14 +1213,13 @@ bool FastISel::lowerCallTo(CallLoweringInfo &CLI) {
if (!FrameAlign)
FrameAlign = TLI.getByValTypeAlignment(ElementTy, DL);
Flags.setByValSize(FrameSize);
- Flags.setByValAlign(FrameAlign);
+ Flags.setByValAlign(Align(FrameAlign));
}
if (Arg.IsNest)
Flags.setNest();
if (NeedsRegBlock)
Flags.setInConsecutiveRegs();
- unsigned OriginalAlignment = DL.getABITypeAlignment(Arg.Ty);
- Flags.setOrigAlign(OriginalAlignment);
+ Flags.setOrigAlign(Align(DL.getABITypeAlignment(Arg.Ty)));
CLI.OutVals.push_back(Arg.Val);
CLI.OutFlags.push_back(Flags);
@@ -1237,8 +1236,8 @@ bool FastISel::lowerCallTo(CallLoweringInfo &CLI) {
updateValueMap(CLI.CS->getInstruction(), CLI.ResultReg, CLI.NumResultRegs);
// Set labels for heapallocsite call.
- if (CLI.CS && CLI.CS->getInstruction()->getMetadata("heapallocsite")) {
- MDNode *MD = CLI.CS->getInstruction()->getMetadata("heapallocsite");
+ if (CLI.CS && CLI.CS->getInstruction()->hasMetadata("heapallocsite")) {
+ const MDNode *MD = CLI.CS->getInstruction()->getMetadata("heapallocsite");
MF->addCodeViewHeapAllocSite(CLI.Call, MD);
}
@@ -1303,6 +1302,7 @@ bool FastISel::selectCall(const User *I) {
ExtraInfo |= InlineAsm::Extra_HasSideEffects;
if (IA->isAlignStack())
ExtraInfo |= InlineAsm::Extra_IsAlignStack;
+ ExtraInfo |= IA->getDialect() * InlineAsm::Extra_AsmDialect;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::INLINEASM))
@@ -1388,9 +1388,11 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
"Expected inlined-at fields to agree");
// A dbg.declare describes the address of a source variable, so lower it
// into an indirect DBG_VALUE.
+ auto *Expr = DI->getExpression();
+ Expr = DIExpression::append(Expr, {dwarf::DW_OP_deref});
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::DBG_VALUE), /*IsIndirect*/ true,
- *Op, DI->getVariable(), DI->getExpression());
+ TII.get(TargetOpcode::DBG_VALUE), /*IsIndirect*/ false,
+ *Op, DI->getVariable(), Expr);
} else {
// We can't yet handle anything else here because it would require
// generating code, thus altering codegen because of debug info.
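[Editor's note] The dbg.declare lowering above swaps the IsIndirect operand for an explicit DW_OP_deref appended to the expression (the same move happens in InstrEmitter::EmitDbgValue further down). A toy model of that append, with the expression as a plain opcode vector:

    #include <cassert>
    #include <vector>

    // 0x06 is the DWARF opcode DW_OP_deref; appending it to the location
    // expression replaces the old IsIndirect flag on the DBG_VALUE.
    std::vector<unsigned> appendDeref(std::vector<unsigned> Expr) {
      Expr.push_back(0x06);
      return Expr;
    }

    int main() {
      std::vector<unsigned> Expr; // an empty DIExpression
      Expr = appendDeref(Expr);
      assert(Expr.size() == 1 && Expr[0] == 0x06);
    }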
@@ -1414,19 +1416,19 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
if (CI->getBitWidth() > 64)
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
.addCImm(CI)
- .addImm(0U)
+ .addReg(0U)
.addMetadata(DI->getVariable())
.addMetadata(DI->getExpression());
else
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
.addImm(CI->getZExtValue())
- .addImm(0U)
+ .addReg(0U)
.addMetadata(DI->getVariable())
.addMetadata(DI->getExpression());
} else if (const auto *CF = dyn_cast<ConstantFP>(V)) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
.addFPImm(CF)
- .addImm(0U)
+ .addReg(0U)
.addMetadata(DI->getVariable())
.addMetadata(DI->getExpression());
} else if (unsigned Reg = lookUpRegForValue(V)) {
@@ -1453,24 +1455,12 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
TII.get(TargetOpcode::DBG_LABEL)).addMetadata(DI->getLabel());
return true;
}
- case Intrinsic::objectsize: {
- ConstantInt *CI = cast<ConstantInt>(II->getArgOperand(1));
- unsigned long long Res = CI->isZero() ? -1ULL : 0;
- Constant *ResCI = ConstantInt::get(II->getType(), Res);
- unsigned ResultReg = getRegForValue(ResCI);
- if (!ResultReg)
- return false;
- updateValueMap(II, ResultReg);
- return true;
- }
- case Intrinsic::is_constant: {
- Constant *ResCI = ConstantInt::get(II->getType(), 0);
- unsigned ResultReg = getRegForValue(ResCI);
- if (!ResultReg)
- return false;
- updateValueMap(II, ResultReg);
- return true;
- }
+ case Intrinsic::objectsize:
+ llvm_unreachable("llvm.objectsize.* should have been lowered already");
+
+ case Intrinsic::is_constant:
+ llvm_unreachable("llvm.is.constant.* should have been lowered already");
+
case Intrinsic::launder_invariant_group:
case Intrinsic::strip_invariant_group:
case Intrinsic::expect: {
@@ -1677,11 +1667,11 @@ bool FastISel::selectInstruction(const Instruction *I) {
/// (fall-through) successor, and update the CFG.
void FastISel::fastEmitBranch(MachineBasicBlock *MSucc,
const DebugLoc &DbgLoc) {
- if (FuncInfo.MBB->getBasicBlock()->size() > 1 &&
+ if (FuncInfo.MBB->getBasicBlock()->sizeWithoutDebug() > 1 &&
FuncInfo.MBB->isLayoutSuccessor(MSucc)) {
- // For more accurate line information if this is the only instruction
- // in the block then emit it, otherwise we have the unconditional
- // fall-through case, which needs no instructions.
+ // For more accurate line information, if this is the only non-debug
+ // instruction in the block then emit it; otherwise we have the
+ // unconditional fall-through case, which needs no instructions.
} else {
// The unconditional branch case.
TII.insertBranch(*FuncInfo.MBB, MSucc, nullptr,
@@ -2028,7 +2018,7 @@ unsigned FastISel::createResultReg(const TargetRegisterClass *RC) {
unsigned FastISel::constrainOperandRegClass(const MCInstrDesc &II, unsigned Op,
unsigned OpNum) {
- if (TargetRegisterInfo::isVirtualRegister(Op)) {
+ if (Register::isVirtualRegister(Op)) {
const TargetRegisterClass *RegClass =
TII.getRegClass(II, OpNum, &TRI, *FuncInfo.MF);
if (!MRI.constrainRegClass(Op, RegClass)) {
@@ -2236,7 +2226,7 @@ unsigned FastISel::fastEmitInst_i(unsigned MachineInstOpcode,
unsigned FastISel::fastEmitInst_extractsubreg(MVT RetVT, unsigned Op0,
bool Op0IsKill, uint32_t Idx) {
unsigned ResultReg = createResultReg(TLI.getRegClassFor(RetVT));
- assert(TargetRegisterInfo::isVirtualRegister(Op0) &&
+ assert(Register::isVirtualRegister(Op0) &&
"Cannot yet extract from physregs");
const TargetRegisterClass *RC = MRI.getRegClass(Op0);
MRI.constrainRegClass(Op0, TRI.getSubClassWithSubReg(RC, Idx));
@@ -2417,10 +2407,9 @@ FastISel::createMachineMemOperandFor(const Instruction *I) const {
} else
return nullptr;
- bool IsNonTemporal = I->getMetadata(LLVMContext::MD_nontemporal) != nullptr;
- bool IsInvariant = I->getMetadata(LLVMContext::MD_invariant_load) != nullptr;
- bool IsDereferenceable =
- I->getMetadata(LLVMContext::MD_dereferenceable) != nullptr;
+ bool IsNonTemporal = I->hasMetadata(LLVMContext::MD_nontemporal);
+ bool IsInvariant = I->hasMetadata(LLVMContext::MD_invariant_load);
+ bool IsDereferenceable = I->hasMetadata(LLVMContext::MD_dereferenceable);
const MDNode *Ranges = I->getMetadata(LLVMContext::MD_range);
AAMDNodes AAInfo;
diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index 8b1759246b76..cf6711adad48 100644
--- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -424,7 +425,7 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
unsigned BitWidth = IntVT.getSizeInBits();
unsigned DestReg = ValueMap[PN];
- if (!TargetRegisterInfo::isVirtualRegister(DestReg))
+ if (!Register::isVirtualRegister(DestReg))
return;
LiveOutRegInfo.grow(DestReg);
LiveOutInfo &DestLOI = LiveOutRegInfo[DestReg];
@@ -445,7 +446,7 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
assert(ValueMap.count(V) && "V should have been placed in ValueMap when its"
"CopyToReg node was created.");
unsigned SrcReg = ValueMap[V];
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ if (!Register::isVirtualRegister(SrcReg)) {
DestLOI.IsValid = false;
return;
}
@@ -480,7 +481,7 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
assert(ValueMap.count(V) && "V should have been placed in ValueMap when "
"its CopyToReg node was created.");
unsigned SrcReg = ValueMap[V];
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ if (!Register::isVirtualRegister(SrcReg)) {
DestLOI.IsValid = false;
return;
}
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 9bc07d35dfc5..c5095995ec2e 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -71,7 +71,7 @@ static unsigned countOperands(SDNode *Node, unsigned NumExpUses,
if (isa<RegisterMaskSDNode>(Node->getOperand(I - 1)))
continue;
if (RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Node->getOperand(I - 1)))
- if (TargetRegisterInfo::isPhysicalRegister(RN->getReg()))
+ if (Register::isPhysicalRegister(RN->getReg()))
continue;
NumImpUses = N - I;
break;
@@ -86,7 +86,7 @@ void InstrEmitter::
EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned,
unsigned SrcReg, DenseMap<SDValue, unsigned> &VRBaseMap) {
unsigned VRBase = 0;
- if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ if (Register::isVirtualRegister(SrcReg)) {
// Just use the input register directly!
SDValue Op(Node, ResNo);
if (IsClone)
@@ -114,7 +114,7 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned,
User->getOperand(2).getNode() == Node &&
User->getOperand(2).getResNo() == ResNo) {
unsigned DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
- if (TargetRegisterInfo::isVirtualRegister(DestReg)) {
+ if (Register::isVirtualRegister(DestReg)) {
VRBase = DestReg;
Match = false;
} else if (DestReg != SrcReg)
@@ -139,7 +139,7 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned,
UseRC = RC;
else if (RC) {
const TargetRegisterClass *ComRC =
- TRI->getCommonSubClass(UseRC, RC, VT.SimpleTy);
+ TRI->getCommonSubClass(UseRC, RC);
// If multiple uses expect disjoint register classes, we emit
// copies in AddRegisterOperand.
if (ComRC)
@@ -219,7 +219,7 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node,
if (II.OpInfo[i].isOptionalDef()) {
// Optional def must be a physical register.
VRBase = cast<RegisterSDNode>(Node->getOperand(i-NumResults))->getReg();
- assert(TargetRegisterInfo::isPhysicalRegister(VRBase));
+ assert(Register::isPhysicalRegister(VRBase));
MIB.addReg(VRBase, RegState::Define);
}
@@ -229,7 +229,7 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node,
User->getOperand(2).getNode() == Node &&
User->getOperand(2).getResNo() == i) {
unsigned Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
const TargetRegisterClass *RegRC = MRI->getRegClass(Reg);
if (RegRC == RC) {
VRBase = Reg;
@@ -272,7 +272,7 @@ unsigned InstrEmitter::getVR(SDValue Op,
// does not include operand register class info.
const TargetRegisterClass *RC = TLI->getRegClassFor(
Op.getSimpleValueType(), Op.getNode()->isDivergent());
- unsigned VReg = MRI->createVirtualRegister(RC);
+ Register VReg = MRI->createVirtualRegister(RC);
BuildMI(*MBB, InsertPos, Op.getDebugLoc(),
TII->get(TargetOpcode::IMPLICIT_DEF), VReg);
return VReg;
@@ -319,7 +319,7 @@ InstrEmitter::AddRegisterOperand(MachineInstrBuilder &MIB,
if (!ConstrainedRC) {
OpRC = TRI->getAllocatableClass(OpRC);
assert(OpRC && "Constraints cannot be fulfilled for allocation");
- unsigned NewVReg = MRI->createVirtualRegister(OpRC);
+ Register NewVReg = MRI->createVirtualRegister(OpRC);
BuildMI(*MBB, InsertPos, Op.getNode()->getDebugLoc(),
TII->get(TargetOpcode::COPY), NewVReg).addReg(VReg);
VReg = NewVReg;
@@ -385,9 +385,8 @@ void InstrEmitter::AddOperand(MachineInstrBuilder &MIB,
(IIRC && TRI->isDivergentRegClass(IIRC)))
: nullptr;
- if (OpRC && IIRC && OpRC != IIRC &&
- TargetRegisterInfo::isVirtualRegister(VReg)) {
- unsigned NewVReg = MRI->createVirtualRegister(IIRC);
+ if (OpRC && IIRC && OpRC != IIRC && Register::isVirtualRegister(VReg)) {
+ Register NewVReg = MRI->createVirtualRegister(IIRC);
BuildMI(*MBB, InsertPos, Op.getNode()->getDebugLoc(),
TII->get(TargetOpcode::COPY), NewVReg).addReg(VReg);
VReg = NewVReg;
@@ -465,7 +464,7 @@ unsigned InstrEmitter::ConstrainForSubReg(unsigned VReg, unsigned SubIdx,
// register instead.
RC = TRI->getSubClassWithSubReg(TLI->getRegClassFor(VT, isDivergent), SubIdx);
assert(RC && "No legal register class for VT supports that SubIdx");
- unsigned NewReg = MRI->createVirtualRegister(RC);
+ Register NewReg = MRI->createVirtualRegister(RC);
BuildMI(*MBB, InsertPos, DL, TII->get(TargetOpcode::COPY), NewReg)
.addReg(VReg);
return NewReg;
@@ -485,7 +484,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
if (User->getOpcode() == ISD::CopyToReg &&
User->getOperand(2).getNode() == Node) {
unsigned DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
- if (TargetRegisterInfo::isVirtualRegister(DestReg)) {
+ if (Register::isVirtualRegister(DestReg)) {
VRBase = DestReg;
break;
}
@@ -503,7 +502,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
unsigned Reg;
MachineInstr *DefMI;
RegisterSDNode *R = dyn_cast<RegisterSDNode>(Node->getOperand(0));
- if (R && TargetRegisterInfo::isPhysicalRegister(R->getReg())) {
+ if (R && Register::isPhysicalRegister(R->getReg())) {
Reg = R->getReg();
DefMI = nullptr;
} else {
@@ -529,7 +528,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
// Reg may not support a SubIdx sub-register, and we may need to
// constrain its register class or issue a COPY to a compatible register
// class.
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
Reg = ConstrainForSubReg(Reg, SubIdx,
Node->getOperand(0).getSimpleValueType(),
Node->isDivergent(), Node->getDebugLoc());
@@ -541,7 +540,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
MachineInstrBuilder CopyMI =
BuildMI(*MBB, InsertPos, Node->getDebugLoc(),
TII->get(TargetOpcode::COPY), VRBase);
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
CopyMI.addReg(Reg, 0, SubIdx);
else
CopyMI.addReg(TRI->getSubReg(Reg, SubIdx));
@@ -614,7 +613,7 @@ InstrEmitter::EmitCopyToRegClassNode(SDNode *Node,
unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
const TargetRegisterClass *DstRC =
TRI->getAllocatableClass(TRI->getRegClass(DstRCIdx));
- unsigned NewVReg = MRI->createVirtualRegister(DstRC);
+ Register NewVReg = MRI->createVirtualRegister(DstRC);
BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY),
NewVReg).addReg(VReg);
@@ -631,7 +630,7 @@ void InstrEmitter::EmitRegSequence(SDNode *Node,
bool IsClone, bool IsCloned) {
unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx);
- unsigned NewVReg = MRI->createVirtualRegister(TRI->getAllocatableClass(RC));
+ Register NewVReg = MRI->createVirtualRegister(TRI->getAllocatableClass(RC));
const MCInstrDesc &II = TII->get(TargetOpcode::REG_SEQUENCE);
MachineInstrBuilder MIB = BuildMI(*MF, Node->getDebugLoc(), II, NewVReg);
unsigned NumOps = Node->getNumOperands();
@@ -649,7 +648,7 @@ void InstrEmitter::EmitRegSequence(SDNode *Node,
RegisterSDNode *R = dyn_cast<RegisterSDNode>(Node->getOperand(i-1));
// Skip physical registers as they don't have a vreg to get and we'll
// insert copies for them in TwoAddressInstructionPass anyway.
- if (!R || !TargetRegisterInfo::isPhysicalRegister(R->getReg())) {
+ if (!R || !Register::isPhysicalRegister(R->getReg())) {
unsigned SubIdx = cast<ConstantSDNode>(Op)->getZExtValue();
unsigned SubReg = getVR(Node->getOperand(i-1), VRBaseMap);
const TargetRegisterClass *TRC = MRI->getRegClass(SubReg);
@@ -678,7 +677,7 @@ MachineInstr *
InstrEmitter::EmitDbgValue(SDDbgValue *SD,
DenseMap<SDValue, unsigned> &VRBaseMap) {
MDNode *Var = SD->getVariable();
- MDNode *Expr = SD->getExpression();
+ const DIExpression *Expr = SD->getExpression();
DebugLoc DL = SD->getDebugLoc();
assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
"Expected inlined-at fields to agree");
@@ -702,12 +701,11 @@ InstrEmitter::EmitDbgValue(SDDbgValue *SD,
// EmitTargetCodeForFrameDebugValue is responsible for allocation.
auto FrameMI = BuildMI(*MF, DL, TII->get(TargetOpcode::DBG_VALUE))
.addFrameIndex(SD->getFrameIx());
+
if (SD->isIndirect())
- // Push [fi + 0] onto the DIExpression stack.
- FrameMI.addImm(0);
- else
- // Push fi onto the DIExpression stack.
- FrameMI.addReg(0);
+ Expr = DIExpression::append(Expr, {dwarf::DW_OP_deref});
+
+ FrameMI.addReg(0);
return FrameMI.addMetadata(Var).addMetadata(Expr);
}
// Otherwise, we're going to create an instruction here.
@@ -753,9 +751,9 @@ InstrEmitter::EmitDbgValue(SDDbgValue *SD,
// Indirect addressing is indicated by an Imm as the second parameter.
if (SD->isIndirect())
- MIB.addImm(0U);
- else
- MIB.addReg(0U, RegState::Debug);
+ Expr = DIExpression::append(Expr, {dwarf::DW_OP_deref});
+
+ MIB.addReg(0U, RegState::Debug);
MIB.addMetadata(Var);
MIB.addMetadata(Expr);
@@ -928,12 +926,12 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
//
// Collect all the used physreg defs, and make sure that any unused physreg
// defs are marked as dead.
- SmallVector<unsigned, 8> UsedRegs;
+ SmallVector<Register, 8> UsedRegs;
// Additional results must be physical register defs.
if (HasPhysRegOuts) {
for (unsigned i = NumDefs; i < NumResults; ++i) {
- unsigned Reg = II.getImplicitDefs()[i - NumDefs];
+ Register Reg = II.getImplicitDefs()[i - NumDefs];
if (!Node->hasAnyUseOfValue(i))
continue;
// This implicitly defined physreg has a use.
@@ -960,8 +958,8 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
// direct RegisterSDNode operands.
for (unsigned i = 0, e = F->getNumOperands(); i != e; ++i)
if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(F->getOperand(i))) {
- unsigned Reg = R->getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ Register Reg = R->getReg();
+ if (Reg.isPhysical())
UsedRegs.push_back(Reg);
}
}
@@ -995,8 +993,7 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
case ISD::CopyToReg: {
unsigned DestReg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
SDValue SrcVal = Node->getOperand(2);
- if (TargetRegisterInfo::isVirtualRegister(DestReg) &&
- SrcVal.isMachineOpcode() &&
+ if (Register::isVirtualRegister(DestReg) && SrcVal.isMachineOpcode() &&
SrcVal.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF) {
// Instead building a COPY to that vreg destination, build an
// IMPLICIT_DEF instruction instead.
@@ -1093,16 +1090,18 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
// FIXME: Add dead flags for physical and virtual registers defined.
// For now, mark physical register defs as implicit to help fast
// regalloc. This makes inline asm look a lot like calls.
- MIB.addReg(Reg, RegState::Define |
- getImplRegState(TargetRegisterInfo::isPhysicalRegister(Reg)));
+ MIB.addReg(Reg,
+ RegState::Define |
+ getImplRegState(Register::isPhysicalRegister(Reg)));
}
break;
case InlineAsm::Kind_RegDefEarlyClobber:
case InlineAsm::Kind_Clobber:
for (unsigned j = 0; j != NumVals; ++j, ++i) {
unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
- MIB.addReg(Reg, RegState::Define | RegState::EarlyClobber |
- getImplRegState(TargetRegisterInfo::isPhysicalRegister(Reg)));
+ MIB.addReg(Reg,
+ RegState::Define | RegState::EarlyClobber |
+ getImplRegState(Register::isPhysicalRegister(Reg)));
ECRegs.push_back(Reg);
}
break;
@@ -1136,7 +1135,7 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
// then remove the early-clobber flag.
for (unsigned Reg : ECRegs) {
if (MIB->readsRegister(Reg, TRI)) {
- MachineOperand *MO =
+ MachineOperand *MO =
MIB->findRegisterDefOperand(Reg, false, false, TRI);
assert(MO && "No def operand for clobbered register?");
MO->setIsEarlyClobber(false);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index bf817f00f83d..f9fdf525240f 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
@@ -161,6 +162,7 @@ private:
SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT,
const SDLoc &dl, SDValue ChainIn);
SDValue ExpandBUILD_VECTOR(SDNode *Node);
+ SDValue ExpandSPLAT_VECTOR(SDNode *Node);
SDValue ExpandSCALAR_TO_VECTOR(SDNode *Node);
void ExpandDYNAMIC_STACKALLOC(SDNode *Node,
SmallVectorImpl<SDValue> &Results);
@@ -236,6 +238,16 @@ public:
}
ReplacedNode(Old);
}
+
+ void ReplaceNodeWithValue(SDValue Old, SDValue New) {
+ LLVM_DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG);
+ dbgs() << " with: "; New->dump(&DAG));
+
+ DAG.ReplaceAllUsesOfValueWith(Old, New);
+ if (UpdatedNodes)
+ UpdatedNodes->insert(New.getNode());
+ ReplacedNode(Old.getNode());
+ }
};
} // end anonymous namespace
@@ -493,8 +505,8 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
// expand it.
EVT MemVT = ST->getMemoryVT();
const DataLayout &DL = DAG.getDataLayout();
- if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT,
- *ST->getMemOperand())) {
+ if (!TLI.allowsMemoryAccessForAlignment(*DAG.getContext(), DL, MemVT,
+ *ST->getMemOperand())) {
LLVM_DEBUG(dbgs() << "Expanding unsupported unaligned store\n");
SDValue Result = TLI.expandUnalignedStore(ST, DAG);
ReplaceNode(SDValue(ST, 0), Result);
@@ -608,8 +620,8 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
EVT MemVT = ST->getMemoryVT();
// If this is an unaligned store and the target doesn't support it,
// expand it.
- if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT,
- *ST->getMemOperand())) {
+ if (!TLI.allowsMemoryAccessForAlignment(*DAG.getContext(), DL, MemVT,
+ *ST->getMemOperand())) {
SDValue Result = TLI.expandUnalignedStore(ST, DAG);
ReplaceNode(SDValue(ST, 0), Result);
}
@@ -669,8 +681,8 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
const DataLayout &DL = DAG.getDataLayout();
// If this is an unaligned load and the target doesn't support it,
// expand it.
- if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT,
- *LD->getMemOperand())) {
+ if (!TLI.allowsMemoryAccessForAlignment(*DAG.getContext(), DL, MemVT,
+ *LD->getMemOperand())) {
std::tie(RVal, RChain) = TLI.expandUnalignedLoad(LD, DAG);
}
break;
@@ -894,11 +906,10 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
if (SrcVT.getScalarType() == MVT::f16) {
EVT ISrcVT = SrcVT.changeTypeToInteger();
EVT IDestVT = DestVT.changeTypeToInteger();
- EVT LoadVT = TLI.getRegisterType(IDestVT.getSimpleVT());
+ EVT ILoadVT = TLI.getRegisterType(IDestVT.getSimpleVT());
- SDValue Result = DAG.getExtLoad(ISD::ZEXTLOAD, dl, LoadVT,
- Chain, Ptr, ISrcVT,
- LD->getMemOperand());
+ SDValue Result = DAG.getExtLoad(ISD::ZEXTLOAD, dl, ILoadVT, Chain,
+ Ptr, ISrcVT, LD->getMemOperand());
Value = DAG.getNode(ISD::FP16_TO_FP, dl, DestVT, Result);
Chain = Result.getValue(1);
break;
@@ -959,15 +970,13 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
#ifndef NDEBUG
for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
- assert((TLI.getTypeAction(*DAG.getContext(), Node->getValueType(i)) ==
- TargetLowering::TypeLegal ||
- TLI.isTypeLegal(Node->getValueType(i))) &&
+ assert(TLI.getTypeAction(*DAG.getContext(), Node->getValueType(i)) ==
+ TargetLowering::TypeLegal &&
"Unexpected illegal type!");
for (const SDValue &Op : Node->op_values())
assert((TLI.getTypeAction(*DAG.getContext(), Op.getValueType()) ==
TargetLowering::TypeLegal ||
- TLI.isTypeLegal(Op.getValueType()) ||
Op.getOpcode() == ISD::TargetConstant ||
Op.getOpcode() == ISD::Register) &&
"Unexpected illegal type!");
@@ -1004,7 +1013,6 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
Action = TLI.getOperationAction(Node->getOpcode(),
Node->getOperand(0).getValueType());
break;
- case ISD::FP_ROUND_INREG:
case ISD::SIGN_EXTEND_INREG: {
EVT InnerType = cast<VTSDNode>(Node->getOperand(1))->getVT();
Action = TLI.getOperationAction(Node->getOpcode(), InnerType);
@@ -1097,38 +1105,15 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
return;
}
break;
- case ISD::STRICT_FADD:
- case ISD::STRICT_FSUB:
- case ISD::STRICT_FMUL:
- case ISD::STRICT_FDIV:
- case ISD::STRICT_FREM:
- case ISD::STRICT_FSQRT:
- case ISD::STRICT_FMA:
- case ISD::STRICT_FPOW:
- case ISD::STRICT_FPOWI:
- case ISD::STRICT_FSIN:
- case ISD::STRICT_FCOS:
- case ISD::STRICT_FEXP:
- case ISD::STRICT_FEXP2:
- case ISD::STRICT_FLOG:
- case ISD::STRICT_FLOG10:
- case ISD::STRICT_FLOG2:
- case ISD::STRICT_FRINT:
- case ISD::STRICT_FNEARBYINT:
- case ISD::STRICT_FMAXNUM:
- case ISD::STRICT_FMINNUM:
- case ISD::STRICT_FCEIL:
- case ISD::STRICT_FFLOOR:
- case ISD::STRICT_FROUND:
- case ISD::STRICT_FTRUNC:
- case ISD::STRICT_FP_ROUND:
- case ISD::STRICT_FP_EXTEND:
- // These pseudo-ops get legalized as if they were their non-strict
- // equivalent. For instance, if ISD::FSQRT is legal then ISD::STRICT_FSQRT
- // is also legal, but if ISD::FSQRT requires expansion then so does
- // ISD::STRICT_FSQRT.
+ case ISD::STRICT_LRINT:
+ case ISD::STRICT_LLRINT:
+ case ISD::STRICT_LROUND:
+ case ISD::STRICT_LLROUND:
+ // These pseudo-ops are the same as the other STRICT_ ops except
+ // they are registered with setOperationAction() using the input type
+ // instead of the output type.
Action = TLI.getStrictFPOperationAction(Node->getOpcode(),
- Node->getValueType(0));
+ Node->getOperand(1).getValueType());
break;
case ISD::SADDSAT:
case ISD::UADDSAT:
@@ -1139,7 +1124,8 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
}
case ISD::SMULFIX:
case ISD::SMULFIXSAT:
- case ISD::UMULFIX: {
+ case ISD::UMULFIX:
+ case ISD::UMULFIXSAT: {
unsigned Scale = Node->getConstantOperandVal(2);
Action = TLI.getFixedPointOperationAction(Node->getOpcode(),
Node->getValueType(0), Scale);
@@ -1650,7 +1636,6 @@ bool SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT, SDValue &LHS,
MVT OpVT = LHS.getSimpleValueType();
ISD::CondCode CCCode = cast<CondCodeSDNode>(CC)->get();
NeedInvert = false;
- bool NeedSwap = false;
switch (TLI.getCondCodeAction(CCCode, OpVT)) {
default: llvm_unreachable("Unknown condition code action!");
case TargetLowering::Legal:
@@ -1664,6 +1649,7 @@ bool SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT, SDValue &LHS,
return true;
}
// Swapping operands didn't work. Try inverting the condition.
+ bool NeedSwap = false;
InvCC = getSetCCInverse(CCCode, OpVT.isInteger());
if (!TLI.isCondCodeLegalOrCustom(InvCC, OpVT)) {
// If inverting the condition is not enough, try swapping operands
@@ -2021,6 +2007,14 @@ SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
return ExpandVectorBuildThroughStack(Node);
}
+SDValue SelectionDAGLegalize::ExpandSPLAT_VECTOR(SDNode *Node) {
+ SDLoc DL(Node);
+ EVT VT = Node->getValueType(0);
+ SDValue SplatVal = Node->getOperand(0);
+
+ return DAG.getSplatBuildVector(VT, DL, SplatVal);
+}
+
// Expand a node into a call to a libcall. If the result value
// does not fit into a register, return the lo part and set the hi part to the
// by-reg argument. If it does fit into a single register, return the result
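[Editor's note] The new ExpandSPLAT_VECTOR above lowers a splat to a BUILD_VECTOR of the repeated scalar via getSplatBuildVector; for a fixed-length vector that is simply N copies of the operand. A minimal sketch:

    #include <cassert>
    #include <vector>

    // SPLAT_VECTOR(SplatVal) expanded for a fixed-length result type:
    // a BUILD_VECTOR whose every lane is the splatted scalar.
    std::vector<int> expandSplat(int SplatVal, unsigned NumElts) {
      return std::vector<int>(NumElts, SplatVal);
    }

    int main() {
      assert((expandSplat(7, 4) == std::vector<int>{7, 7, 7, 7}));
    }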
@@ -2074,12 +2068,12 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
if (!CallInfo.second.getNode()) {
- LLVM_DEBUG(dbgs() << "Created tailcall: "; DAG.getRoot().dump());
+ LLVM_DEBUG(dbgs() << "Created tailcall: "; DAG.getRoot().dump(&DAG));
// It's a tailcall, return the chain (which is the DAG root).
return DAG.getRoot();
}
- LLVM_DEBUG(dbgs() << "Created libcall: "; CallInfo.first.dump());
+ LLVM_DEBUG(dbgs() << "Created libcall: "; CallInfo.first.dump(&DAG));
return CallInfo.first;
}
@@ -2167,6 +2161,9 @@ SDValue SelectionDAGLegalize::ExpandArgFPLibCall(SDNode* Node,
RTLIB::Libcall Call_F80,
RTLIB::Libcall Call_F128,
RTLIB::Libcall Call_PPCF128) {
+ if (Node->isStrictFPOpcode())
+ Node = DAG.mutateStrictFPToFP(Node);
+
RTLIB::Libcall LC;
switch (Node->getOperand(0).getValueType().getSimpleVT().SimpleTy) {
default: llvm_unreachable("Unexpected request for libcall!");
@@ -2815,6 +2812,12 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
break;
}
case ISD::STRICT_FP_ROUND:
+ // This expansion does not honor the "strict" properties anyway,
+ // so prefer falling back to the non-strict operation if legal.
+ if (TLI.getStrictFPOperationAction(Node->getOpcode(),
+ Node->getValueType(0))
+ == TargetLowering::Legal)
+ break;
Tmp1 = EmitStackConvert(Node->getOperand(1),
Node->getValueType(0),
Node->getValueType(0), dl, Node->getOperand(0));
@@ -2829,6 +2832,12 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
Results.push_back(Tmp1);
break;
case ISD::STRICT_FP_EXTEND:
+ // This expansion does not honor the "strict" properties anyway,
+ // so prefer falling back to the non-strict operation if legal.
+ if (TLI.getStrictFPOperationAction(Node->getOpcode(),
+ Node->getValueType(0))
+ == TargetLowering::Legal)
+ break;
Tmp1 = EmitStackConvert(Node->getOperand(1),
Node->getOperand(1).getValueType(),
Node->getValueType(0), dl, Node->getOperand(0));
@@ -2873,19 +2882,6 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
Results.push_back(Tmp1);
break;
}
- case ISD::FP_ROUND_INREG: {
- // The only way we can lower this is to turn it into a TRUNCSTORE,
- // EXTLOAD pair, targeting a temporary location (a stack slot).
-
- // NOTE: there is a choice here between constantly creating new stack
- // slots and always reusing the same one. We currently always create
- // new ones, as reuse may inhibit scheduling.
- EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
- Tmp1 = EmitStackConvert(Node->getOperand(0), ExtraVT,
- Node->getValueType(0), dl);
- Results.push_back(Tmp1);
- break;
- }
case ISD::UINT_TO_FP:
if (TLI.expandUINT_TO_FP(Node, Tmp1, DAG)) {
Results.push_back(Tmp1);
@@ -2901,33 +2897,26 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
if (TLI.expandFP_TO_SINT(Node, Tmp1, DAG))
Results.push_back(Tmp1);
break;
+ case ISD::STRICT_FP_TO_SINT:
+ if (TLI.expandFP_TO_SINT(Node, Tmp1, DAG)) {
+ ReplaceNode(Node, Tmp1.getNode());
+ LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_FP_TO_SINT node\n");
+ return true;
+ }
+ break;
case ISD::FP_TO_UINT:
- if (TLI.expandFP_TO_UINT(Node, Tmp1, DAG))
+ if (TLI.expandFP_TO_UINT(Node, Tmp1, Tmp2, DAG))
Results.push_back(Tmp1);
break;
- case ISD::LROUND:
- Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LROUND_F32,
- RTLIB::LROUND_F64, RTLIB::LROUND_F80,
- RTLIB::LROUND_F128,
- RTLIB::LROUND_PPCF128));
- break;
- case ISD::LLROUND:
- Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LLROUND_F32,
- RTLIB::LLROUND_F64, RTLIB::LLROUND_F80,
- RTLIB::LLROUND_F128,
- RTLIB::LLROUND_PPCF128));
- break;
- case ISD::LRINT:
- Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LRINT_F32,
- RTLIB::LRINT_F64, RTLIB::LRINT_F80,
- RTLIB::LRINT_F128,
- RTLIB::LRINT_PPCF128));
- break;
- case ISD::LLRINT:
- Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LLRINT_F32,
- RTLIB::LLRINT_F64, RTLIB::LLRINT_F80,
- RTLIB::LLRINT_F128,
- RTLIB::LLRINT_PPCF128));
+ case ISD::STRICT_FP_TO_UINT:
+ if (TLI.expandFP_TO_UINT(Node, Tmp1, Tmp2, DAG)) {
+ // Relink the chain.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Tmp2);
+ // Replace the FP_TO_UINT result with the newly expanded value.
+ ReplaceNodeWithValue(SDValue(Node, 0), Tmp1);
+ LLVM_DEBUG(dbgs() << "Successfully expanded STRICT_FP_TO_UINT node\n");
+ return true;
+ }
break;
case ISD::VAARG:
Results.push_back(DAG.expandVAArg(Node));
@@ -3348,6 +3337,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
case ISD::SMULFIX:
case ISD::SMULFIXSAT:
case ISD::UMULFIX:
+ case ISD::UMULFIXSAT:
Results.push_back(TLI.expandFixedPointMul(Node, DAG));
break;
case ISD::ADDCARRY:
@@ -3662,6 +3652,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
case ISD::BUILD_VECTOR:
Results.push_back(ExpandBUILD_VECTOR(Node));
break;
+ case ISD::SPLAT_VECTOR:
+ Results.push_back(ExpandSPLAT_VECTOR(Node));
+ break;
case ISD::SRA:
case ISD::SRL:
case ISD::SHL: {
@@ -3715,6 +3708,33 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
break;
}
+ if (Results.empty() && Node->isStrictFPOpcode()) {
+ // FIXME: We were asked to expand a strict floating-point operation,
+ // but there is currently no expansion implemented that would preserve
+ // the "strict" properties. For now, we just fall back to the non-strict
+ // version if that is legal on the target. The actual mutation of the
+ // operation will happen in SelectionDAGISel::DoInstructionSelection.
+ switch (Node->getOpcode()) {
+ default:
+ if (TLI.getStrictFPOperationAction(Node->getOpcode(),
+ Node->getValueType(0))
+ == TargetLowering::Legal)
+ return true;
+ break;
+ case ISD::STRICT_LRINT:
+ case ISD::STRICT_LLRINT:
+ case ISD::STRICT_LROUND:
+ case ISD::STRICT_LLROUND:
+ // These are registered by the operand type instead of the value
+ // type. Reflect that here.
+ if (TLI.getStrictFPOperationAction(Node->getOpcode(),
+ Node->getOperand(1).getValueType())
+ == TargetLowering::Legal)
+ return true;
+ break;
+ }
+ }
+
// Replace the original node with the legalized result.
if (Results.empty()) {
LLVM_DEBUG(dbgs() << "Cannot expand node\n");
@@ -3956,6 +3976,34 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
RTLIB::POW_F80, RTLIB::POW_F128,
RTLIB::POW_PPCF128));
break;
+ case ISD::LROUND:
+ case ISD::STRICT_LROUND:
+ Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LROUND_F32,
+ RTLIB::LROUND_F64, RTLIB::LROUND_F80,
+ RTLIB::LROUND_F128,
+ RTLIB::LROUND_PPCF128));
+ break;
+ case ISD::LLROUND:
+ case ISD::STRICT_LLROUND:
+ Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LLROUND_F32,
+ RTLIB::LLROUND_F64, RTLIB::LLROUND_F80,
+ RTLIB::LLROUND_F128,
+ RTLIB::LLROUND_PPCF128));
+ break;
+ case ISD::LRINT:
+ case ISD::STRICT_LRINT:
+ Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LRINT_F32,
+ RTLIB::LRINT_F64, RTLIB::LRINT_F80,
+ RTLIB::LRINT_F128,
+ RTLIB::LRINT_PPCF128));
+ break;
+ case ISD::LLRINT:
+ case ISD::STRICT_LLRINT:
+ Results.push_back(ExpandArgFPLibCall(Node, RTLIB::LLRINT_F32,
+ RTLIB::LLRINT_F64, RTLIB::LLRINT_F80,
+ RTLIB::LLRINT_F128,
+ RTLIB::LLRINT_PPCF128));
+ break;
case ISD::FDIV:
Results.push_back(ExpandFPLibCall(Node, RTLIB::DIV_F32, RTLIB::DIV_F64,
RTLIB::DIV_F80, RTLIB::DIV_F128,
diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index b4849b2881e6..72d052473f11 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -42,10 +42,10 @@ static RTLIB::Libcall GetFPLibCall(EVT VT,
}
//===----------------------------------------------------------------------===//
-// Convert Float Results to Integer for Non-HW-supported Operations.
+// Convert Float Results to Integer
//===----------------------------------------------------------------------===//
-bool DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
+void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
LLVM_DEBUG(dbgs() << "Soften float result " << ResNo << ": "; N->dump(&DAG);
dbgs() << "\n");
SDValue R = SDValue();
@@ -58,26 +58,18 @@ bool DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
#endif
llvm_unreachable("Do not know how to soften the result of this operator!");
- case ISD::Register:
- case ISD::CopyFromReg:
- case ISD::CopyToReg:
- assert(isLegalInHWReg(N->getValueType(ResNo)) &&
- "Unsupported SoftenFloatRes opcode!");
- // Only when isLegalInHWReg, we can skip check of the operands.
- R = SDValue(N, ResNo);
- break;
case ISD::MERGE_VALUES:R = SoftenFloatRes_MERGE_VALUES(N, ResNo); break;
- case ISD::BITCAST: R = SoftenFloatRes_BITCAST(N, ResNo); break;
+ case ISD::BITCAST: R = SoftenFloatRes_BITCAST(N); break;
case ISD::BUILD_PAIR: R = SoftenFloatRes_BUILD_PAIR(N); break;
- case ISD::ConstantFP: R = SoftenFloatRes_ConstantFP(N, ResNo); break;
+ case ISD::ConstantFP: R = SoftenFloatRes_ConstantFP(N); break;
case ISD::EXTRACT_VECTOR_ELT:
R = SoftenFloatRes_EXTRACT_VECTOR_ELT(N, ResNo); break;
- case ISD::FABS: R = SoftenFloatRes_FABS(N, ResNo); break;
+ case ISD::FABS: R = SoftenFloatRes_FABS(N); break;
case ISD::FMINNUM: R = SoftenFloatRes_FMINNUM(N); break;
case ISD::FMAXNUM: R = SoftenFloatRes_FMAXNUM(N); break;
case ISD::FADD: R = SoftenFloatRes_FADD(N); break;
case ISD::FCEIL: R = SoftenFloatRes_FCEIL(N); break;
- case ISD::FCOPYSIGN: R = SoftenFloatRes_FCOPYSIGN(N, ResNo); break;
+ case ISD::FCOPYSIGN: R = SoftenFloatRes_FCOPYSIGN(N); break;
case ISD::FCOS: R = SoftenFloatRes_FCOS(N); break;
case ISD::FDIV: R = SoftenFloatRes_FDIV(N); break;
case ISD::FEXP: R = SoftenFloatRes_FEXP(N); break;
@@ -89,7 +81,7 @@ bool DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
case ISD::FMA: R = SoftenFloatRes_FMA(N); break;
case ISD::FMUL: R = SoftenFloatRes_FMUL(N); break;
case ISD::FNEARBYINT: R = SoftenFloatRes_FNEARBYINT(N); break;
- case ISD::FNEG: R = SoftenFloatRes_FNEG(N, ResNo); break;
+ case ISD::FNEG: R = SoftenFloatRes_FNEG(N); break;
case ISD::FP_EXTEND: R = SoftenFloatRes_FP_EXTEND(N); break;
case ISD::FP_ROUND: R = SoftenFloatRes_FP_ROUND(N); break;
case ISD::FP16_TO_FP: R = SoftenFloatRes_FP16_TO_FP(N); break;
@@ -102,30 +94,24 @@ bool DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
case ISD::FSQRT: R = SoftenFloatRes_FSQRT(N); break;
case ISD::FSUB: R = SoftenFloatRes_FSUB(N); break;
case ISD::FTRUNC: R = SoftenFloatRes_FTRUNC(N); break;
- case ISD::LOAD: R = SoftenFloatRes_LOAD(N, ResNo); break;
+ case ISD::LOAD: R = SoftenFloatRes_LOAD(N); break;
case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break;
- case ISD::SELECT: R = SoftenFloatRes_SELECT(N, ResNo); break;
- case ISD::SELECT_CC: R = SoftenFloatRes_SELECT_CC(N, ResNo); break;
+ case ISD::SELECT: R = SoftenFloatRes_SELECT(N); break;
+ case ISD::SELECT_CC: R = SoftenFloatRes_SELECT_CC(N); break;
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP: R = SoftenFloatRes_XINT_TO_FP(N); break;
case ISD::UNDEF: R = SoftenFloatRes_UNDEF(N); break;
case ISD::VAARG: R = SoftenFloatRes_VAARG(N); break;
}
- if (R.getNode() && R.getNode() != N) {
+ // If R is null, the sub-method took care of registering the result.
+ if (R.getNode()) {
+ assert(R.getNode() != N);
SetSoftenedFloat(SDValue(N, ResNo), R);
- // Return true only if the node is changed, assuming that the operands
- // are also converted when necessary.
- return true;
}
-
- // Otherwise, return false to tell caller to scan operands.
- return false;
}
-SDValue DAGTypeLegalizer::SoftenFloatRes_BITCAST(SDNode *N, unsigned ResNo) {
- if (isLegalInHWReg(N->getValueType(ResNo)))
- return SDValue(N, ResNo);
+SDValue DAGTypeLegalizer::SoftenFloatRes_BITCAST(SDNode *N) {
return BitConvertToInteger(N->getOperand(0));
}
@@ -144,10 +130,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_BUILD_PAIR(SDNode *N) {
BitConvertToInteger(N->getOperand(1)));
}
-SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo) {
- // When LegalInHWReg, we can load better from the constant pool.
- if (isLegalInHWReg(N->getValueType(ResNo)))
- return SDValue(N, ResNo);
+SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(SDNode *N) {
ConstantFPSDNode *CN = cast<ConstantFPSDNode>(N);
// In ppcf128, the high 64 bits are always first in memory regardless
// of Endianness. LLVM's APFloat representation is not Endian sensitive,
@@ -172,19 +155,13 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo) {
}
SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N, unsigned ResNo) {
- // When LegalInHWReg, keep the extracted value in register.
- if (isLegalInHWReg(N->getValueType(ResNo)))
- return SDValue(N, ResNo);
SDValue NewOp = BitConvertVectorToIntegerVector(N->getOperand(0));
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
NewOp.getValueType().getVectorElementType(),
NewOp, N->getOperand(1));
}
-SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N, unsigned ResNo) {
- // When LegalInHWReg, FABS can be implemented as native bitwise operations.
- if (isLegalInHWReg(N->getValueType(ResNo)))
- return SDValue(N, ResNo);
+SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
unsigned Size = NVT.getSizeInBits();
@@ -200,57 +177,69 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMINNUM(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
GetSoftenedFloat(N->getOperand(1)) };
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[2] = { N->getOperand(0).getValueType(),
+ N->getOperand(1).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::FMIN_F32,
RTLIB::FMIN_F64,
RTLIB::FMIN_F80,
RTLIB::FMIN_F128,
RTLIB::FMIN_PPCF128),
- NVT, Ops, false, SDLoc(N)).first;
+ NVT, Ops, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FMAXNUM(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
GetSoftenedFloat(N->getOperand(1)) };
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[2] = { N->getOperand(0).getValueType(),
+ N->getOperand(1).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::FMAX_F32,
RTLIB::FMAX_F64,
RTLIB::FMAX_F80,
RTLIB::FMAX_F128,
RTLIB::FMAX_PPCF128),
- NVT, Ops, false, SDLoc(N)).first;
+ NVT, Ops, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FADD(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
GetSoftenedFloat(N->getOperand(1)) };
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[2] = { N->getOperand(0).getValueType(),
+ N->getOperand(1).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::ADD_F32,
RTLIB::ADD_F64,
RTLIB::ADD_F80,
RTLIB::ADD_F128,
RTLIB::ADD_PPCF128),
- NVT, Ops, false, SDLoc(N)).first;
+ NVT, Ops, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FCEIL(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::CEIL_F32,
RTLIB::CEIL_F64,
RTLIB::CEIL_F80,
RTLIB::CEIL_F128,
RTLIB::CEIL_PPCF128),
- NVT, Op, false, SDLoc(N)).first;
+ NVT, Op, CallOptions, SDLoc(N)).first;
}
-SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N, unsigned ResNo) {
- // When LegalInHWReg, FCOPYSIGN can be implemented as native bitwise operations.
- if (isLegalInHWReg(N->getValueType(ResNo)))
- return SDValue(N, ResNo);
+SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N) {
SDValue LHS = GetSoftenedFloat(N->getOperand(0));
SDValue RHS = BitConvertToInteger(N->getOperand(1));
SDLoc dl(N);
@@ -301,98 +290,123 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N, unsigned ResNo) {
SDValue DAGTypeLegalizer::SoftenFloatRes_FCOS(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::COS_F32,
RTLIB::COS_F64,
RTLIB::COS_F80,
RTLIB::COS_F128,
RTLIB::COS_PPCF128),
- NVT, Op, false, SDLoc(N)).first;
+ NVT, Op, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FDIV(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
GetSoftenedFloat(N->getOperand(1)) };
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[2] = { N->getOperand(0).getValueType(),
+ N->getOperand(1).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::DIV_F32,
RTLIB::DIV_F64,
RTLIB::DIV_F80,
RTLIB::DIV_F128,
RTLIB::DIV_PPCF128),
- NVT, Ops, false, SDLoc(N)).first;
+ NVT, Ops, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::EXP_F32,
RTLIB::EXP_F64,
RTLIB::EXP_F80,
RTLIB::EXP_F128,
RTLIB::EXP_PPCF128),
- NVT, Op, false, SDLoc(N)).first;
+ NVT, Op, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP2(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::EXP2_F32,
RTLIB::EXP2_F64,
RTLIB::EXP2_F80,
RTLIB::EXP2_F128,
RTLIB::EXP2_PPCF128),
- NVT, Op, false, SDLoc(N)).first;
+ NVT, Op, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FFLOOR(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::FLOOR_F32,
RTLIB::FLOOR_F64,
RTLIB::FLOOR_F80,
RTLIB::FLOOR_F128,
RTLIB::FLOOR_PPCF128),
- NVT, Op, false, SDLoc(N)).first;
+ NVT, Op, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::LOG_F32,
RTLIB::LOG_F64,
RTLIB::LOG_F80,
RTLIB::LOG_F128,
RTLIB::LOG_PPCF128),
- NVT, Op, false, SDLoc(N)).first;
+ NVT, Op, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG2(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::LOG2_F32,
RTLIB::LOG2_F64,
RTLIB::LOG2_F80,
RTLIB::LOG2_F128,
RTLIB::LOG2_PPCF128),
- NVT, Op, false, SDLoc(N)).first;
+ NVT, Op, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG10(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::LOG10_F32,
RTLIB::LOG10_F64,
RTLIB::LOG10_F80,
RTLIB::LOG10_F128,
RTLIB::LOG10_PPCF128),
- NVT, Op, false, SDLoc(N)).first;
+ NVT, Op, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) {
@@ -400,48 +414,57 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) {
SDValue Ops[3] = { GetSoftenedFloat(N->getOperand(0)),
GetSoftenedFloat(N->getOperand(1)),
GetSoftenedFloat(N->getOperand(2)) };
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[3] = { N->getOperand(0).getValueType(),
+ N->getOperand(1).getValueType(),
+ N->getOperand(2).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::FMA_F32,
RTLIB::FMA_F64,
RTLIB::FMA_F80,
RTLIB::FMA_F128,
RTLIB::FMA_PPCF128),
- NVT, Ops, false, SDLoc(N)).first;
+ NVT, Ops, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FMUL(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
GetSoftenedFloat(N->getOperand(1)) };
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[2] = { N->getOperand(0).getValueType(),
+ N->getOperand(1).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::MUL_F32,
RTLIB::MUL_F64,
RTLIB::MUL_F80,
RTLIB::MUL_F128,
RTLIB::MUL_PPCF128),
- NVT, Ops, false, SDLoc(N)).first;
+ NVT, Ops, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FNEARBYINT(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::NEARBYINT_F32,
RTLIB::NEARBYINT_F64,
RTLIB::NEARBYINT_F80,
RTLIB::NEARBYINT_F128,
RTLIB::NEARBYINT_PPCF128),
- NVT, Op, false, SDLoc(N)).first;
+ NVT, Op, CallOptions, SDLoc(N)).first;
}
-SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N, unsigned ResNo) {
- // When LegalInHWReg, FNEG can be implemented as native bitwise operations.
- if (isLegalInHWReg(N->getValueType(ResNo)))
- return SDValue(N, ResNo);
+SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDLoc dl(N);
- EVT FloatVT = N->getValueType(ResNo);
+ EVT FloatVT = N->getValueType(0);
if (FloatVT == MVT::f32 || FloatVT == MVT::f64 || FloatVT == MVT::f128) {
// Expand Y = FNEG(X) -> Y = X ^ sign mask
APInt SignMask = APInt::getSignMask(NVT.getSizeInBits());
@@ -452,13 +475,14 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N, unsigned ResNo) {
// Expand Y = FNEG(X) -> Y = SUB -0.0, X
SDValue Ops[2] = { DAG.getConstantFP(-0.0, dl, N->getValueType(0)),
GetSoftenedFloat(N->getOperand(0)) };
+ TargetLowering::MakeLibCallOptions CallOptions;
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::SUB_F32,
RTLIB::SUB_F64,
RTLIB::SUB_F80,
RTLIB::SUB_F128,
RTLIB::SUB_PPCF128),
- NVT, Ops, false, dl).first;
+ NVT, Ops, CallOptions, dl).first;
}
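
A standalone illustration of the sign-mask expansion above (a sketch, not part of the patch): once an f32 lives in an i32 register, FNEG is a single integer XOR.

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Y = FNEG(X) -> Y = X ^ sign mask, on a "softened" f32 held as i32.
    int main() {
      float x = 1.5f;
      uint32_t bits;
      std::memcpy(&bits, &x, sizeof bits);  // bit-convert to integer
      bits ^= 0x80000000u;                  // APInt::getSignMask(32) equivalent
      float y;
      std::memcpy(&y, &bits, sizeof y);
      std::printf("%f\n", y);               // prints -1.500000
    }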
SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
@@ -485,7 +509,10 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0));
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!");
- return TLI.makeLibCall(DAG, LC, NVT, Op, false, SDLoc(N)).first;
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
+ return TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, SDLoc(N)).first;
}
// FIXME: Should we just use 'normal' FP_EXTEND / FP_TRUNC instead of special
@@ -493,15 +520,18 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP(SDNode *N) {
EVT MidVT = TLI.getTypeToTransformTo(*DAG.getContext(), MVT::f32);
SDValue Op = N->getOperand(0);
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
SDValue Res32 = TLI.makeLibCall(DAG, RTLIB::FPEXT_F16_F32, MidVT, Op,
- false, SDLoc(N)).first;
+ CallOptions, SDLoc(N)).first;
if (N->getValueType(0) == MVT::f32)
return Res32;
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
RTLIB::Libcall LC = RTLIB::getFPEXT(MVT::f32, N->getValueType(0));
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!");
- return TLI.makeLibCall(DAG, LC, NVT, Res32, false, SDLoc(N)).first;
+ return TLI.makeLibCall(DAG, LC, NVT, Res32, CallOptions, SDLoc(N)).first;
}
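
The two-step extension above exists because there is no direct f16-to-f64 libcall: the value always goes through f32 first. A standalone sketch of the same shape, with a minimal normals-only stand-in for the RTLIB::FPEXT_F16_F32 routine (real runtimes use a target-specific symbol such as __extendhfsf2; that name and this helper are illustrative assumptions):

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Stand-in for the f16 -> f32 libcall; handles normal numbers only.
    static float ext_h2f(uint16_t h) {
      uint32_t sign = (uint32_t)(h & 0x8000u) << 16;
      uint32_t exp  = ((h >> 10) & 0x1Fu) - 15 + 127;  // rebias the exponent
      uint32_t man  = (uint32_t)(h & 0x3FFu) << 13;    // widen the mantissa
      uint32_t bits = sign | (exp << 23) | man;
      float f;
      std::memcpy(&f, &bits, sizeof f);
      return f;
    }

    int main() {
      // f16 1.0 (0x3C00) -> f32, then the ordinary FPEXT step up to f64.
      double d = (double)ext_h2f(0x3C00);
      std::printf("%f\n", d);  // prints 1.000000
    }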
SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) {
@@ -515,20 +545,27 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) {
RTLIB::Libcall LC = RTLIB::getFPROUND(Op.getValueType(), N->getValueType(0));
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND!");
- return TLI.makeLibCall(DAG, LC, NVT, Op, false, SDLoc(N)).first;
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
+ return TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FPOW(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
GetSoftenedFloat(N->getOperand(1)) };
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[2] = { N->getOperand(0).getValueType(),
+ N->getOperand(1).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::POW_F32,
RTLIB::POW_F64,
RTLIB::POW_F80,
RTLIB::POW_F128,
RTLIB::POW_PPCF128),
- NVT, Ops, false, SDLoc(N)).first;
+ NVT, Ops, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) {
@@ -536,87 +573,111 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) {
"Unsupported power type!");
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), N->getOperand(1) };
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[2] = { N->getOperand(0).getValueType(),
+ N->getOperand(1).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::POWI_F32,
RTLIB::POWI_F64,
RTLIB::POWI_F80,
RTLIB::POWI_F128,
RTLIB::POWI_PPCF128),
- NVT, Ops, false, SDLoc(N)).first;
+ NVT, Ops, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FREM(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
GetSoftenedFloat(N->getOperand(1)) };
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[2] = { N->getOperand(0).getValueType(),
+ N->getOperand(1).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::REM_F32,
RTLIB::REM_F64,
RTLIB::REM_F80,
RTLIB::REM_F128,
RTLIB::REM_PPCF128),
- NVT, Ops, false, SDLoc(N)).first;
+ NVT, Ops, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FRINT(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::RINT_F32,
RTLIB::RINT_F64,
RTLIB::RINT_F80,
RTLIB::RINT_F128,
RTLIB::RINT_PPCF128),
- NVT, Op, false, SDLoc(N)).first;
+ NVT, Op, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FROUND(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::ROUND_F32,
RTLIB::ROUND_F64,
RTLIB::ROUND_F80,
RTLIB::ROUND_F128,
RTLIB::ROUND_PPCF128),
- NVT, Op, false, SDLoc(N)).first;
+ NVT, Op, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FSIN(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::SIN_F32,
RTLIB::SIN_F64,
RTLIB::SIN_F80,
RTLIB::SIN_F128,
RTLIB::SIN_PPCF128),
- NVT, Op, false, SDLoc(N)).first;
+ NVT, Op, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FSQRT(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::SQRT_F32,
RTLIB::SQRT_F64,
RTLIB::SQRT_F80,
RTLIB::SQRT_F128,
RTLIB::SQRT_PPCF128),
- NVT, Op, false, SDLoc(N)).first;
+ NVT, Op, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
GetSoftenedFloat(N->getOperand(1)) };
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[2] = { N->getOperand(0).getValueType(),
+ N->getOperand(1).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::SUB_F32,
RTLIB::SUB_F64,
RTLIB::SUB_F80,
RTLIB::SUB_F128,
RTLIB::SUB_PPCF128),
- NVT, Ops, false, SDLoc(N)).first;
+ NVT, Ops, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) {
@@ -625,17 +686,19 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) {
return DAG.getNode(ISD::FP_TO_FP16, SDLoc(N), NVT, N->getOperand(0));
SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::TRUNC_F32,
RTLIB::TRUNC_F64,
RTLIB::TRUNC_F80,
RTLIB::TRUNC_F128,
RTLIB::TRUNC_PPCF128),
- NVT, Op, false, SDLoc(N)).first;
+ NVT, Op, CallOptions, SDLoc(N)).first;
}
-SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N, unsigned ResNo) {
- bool LegalInHWReg = isLegalInHWReg(N->getValueType(ResNo));
+SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) {
LoadSDNode *L = cast<LoadSDNode>(N);
EVT VT = N->getValueType(0);
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
@@ -666,23 +729,17 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N, unsigned ResNo) {
// use the new one.
ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
auto ExtendNode = DAG.getNode(ISD::FP_EXTEND, dl, VT, NewL);
- if (LegalInHWReg)
- return ExtendNode;
return BitConvertToInteger(ExtendNode);
}
-SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N, unsigned ResNo) {
- if (isLegalInHWReg(N->getValueType(ResNo)))
- return SDValue(N, ResNo);
+SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N) {
SDValue LHS = GetSoftenedFloat(N->getOperand(1));
SDValue RHS = GetSoftenedFloat(N->getOperand(2));
return DAG.getSelect(SDLoc(N),
LHS.getValueType(), N->getOperand(0), LHS, RHS);
}
-SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N, unsigned ResNo) {
- if (isLegalInHWReg(N->getValueType(ResNo)))
- return SDValue(N, ResNo);
+SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N) {
SDValue LHS = GetSoftenedFloat(N->getOperand(2));
SDValue RHS = GetSoftenedFloat(N->getOperand(3));
return DAG.getNode(ISD::SELECT_CC, SDLoc(N),
@@ -736,14 +793,18 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) {
// Sign/zero extend the argument if the libcall takes a larger type.
SDValue Op = DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl,
NVT, N->getOperand(0));
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(Signed);
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, LC,
TLI.getTypeToTransformTo(*DAG.getContext(), RVT),
- Op, Signed, dl).first;
+ Op, CallOptions, dl).first;
}
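
The extend-then-call step above, shown standalone (a sketch, not part of the patch): when i16-to-float is lowered via the i32 libcall, the argument must first be widened, and whether that widening is a sign or a zero extension is exactly what CallOptions.setSExt(Signed) communicates.

    #include <cstdint>
    #include <cstdio>

    int main() {
      int16_t  s = -5;
      uint16_t u = 0xFFFBu;            // same bit pattern as s
      int32_t  sarg = s;               // SIGN_EXTEND for SINT_TO_FP
      uint32_t uarg = u;               // ZERO_EXTEND for UINT_TO_FP
      std::printf("%f %f\n", (double)(float)sarg, (double)(float)uarg);
      // prints -5.000000 65531.000000 -- same bits, different values
    }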
//===----------------------------------------------------------------------===//
-// Convert Float Operand to Integer for Non-HW-supported Operations.
+// Convert Float Operand to Integer
//===----------------------------------------------------------------------===//
bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) {
@@ -753,8 +814,6 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) {
switch (N->getOpcode()) {
default:
- if (CanSkipSoftenFloatOperand(N, OpNo))
- return false;
#ifndef NDEBUG
dbgs() << "SoftenFloatOperand Op #" << OpNo << ": ";
N->dump(&DAG); dbgs() << "\n";
@@ -762,11 +821,7 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) {
llvm_unreachable("Do not know how to soften this operator's operand!");
case ISD::BITCAST: Res = SoftenFloatOp_BITCAST(N); break;
- case ISD::CopyToReg: Res = SoftenFloatOp_COPY_TO_REG(N); break;
case ISD::BR_CC: Res = SoftenFloatOp_BR_CC(N); break;
- case ISD::FABS: Res = SoftenFloatOp_FABS(N); break;
- case ISD::FCOPYSIGN: Res = SoftenFloatOp_FCOPYSIGN(N); break;
- case ISD::FNEG: Res = SoftenFloatOp_FNEG(N); break;
case ISD::FP_EXTEND: Res = SoftenFloatOp_FP_EXTEND(N); break;
case ISD::FP_TO_FP16: // Same as FP_ROUND for softening purposes
case ISD::FP_ROUND: Res = SoftenFloatOp_FP_ROUND(N); break;
@@ -776,19 +831,9 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) {
case ISD::LLROUND: Res = SoftenFloatOp_LLROUND(N); break;
case ISD::LRINT: Res = SoftenFloatOp_LRINT(N); break;
case ISD::LLRINT: Res = SoftenFloatOp_LLRINT(N); break;
- case ISD::SELECT: Res = SoftenFloatOp_SELECT(N); break;
case ISD::SELECT_CC: Res = SoftenFloatOp_SELECT_CC(N); break;
case ISD::SETCC: Res = SoftenFloatOp_SETCC(N); break;
- case ISD::STORE:
- Res = SoftenFloatOp_STORE(N, OpNo);
- // Do not try to analyze or soften this node again if the value is
- // or can be held in a register. In that case, Res.getNode() should
- // be equal to N.
- if (Res.getNode() == N &&
- isLegalInHWReg(N->getOperand(OpNo).getValueType()))
- return false;
- // Otherwise, we need to reanalyze and lower the new Res nodes.
- break;
+ case ISD::STORE: Res = SoftenFloatOp_STORE(N, OpNo); break;
}
// If the result is null, the sub-method took care of registering results etc.
@@ -800,60 +845,16 @@ bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) {
return true;
assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
- "Invalid operand expansion");
+ "Invalid operand promotion");
ReplaceValueWith(SDValue(N, 0), Res);
return false;
}
-bool DAGTypeLegalizer::CanSkipSoftenFloatOperand(SDNode *N, unsigned OpNo) {
- if (!isLegalInHWReg(N->getOperand(OpNo).getValueType()))
- return false;
-
- // When the operand type can be kept in registers there is nothing to do for
- // the following opcodes.
- switch (N->getOperand(OpNo).getOpcode()) {
- case ISD::BITCAST:
- case ISD::ConstantFP:
- case ISD::CopyFromReg:
- case ISD::CopyToReg:
- case ISD::FABS:
- case ISD::FCOPYSIGN:
- case ISD::FNEG:
- case ISD::Register:
- case ISD::SELECT:
- case ISD::SELECT_CC:
- return true;
- }
-
- switch (N->getOpcode()) {
- case ISD::ConstantFP: // Leaf node.
- case ISD::CopyFromReg: // Operand is a register that we know to be left
- // unchanged by SoftenFloatResult().
- case ISD::Register: // Leaf node.
- return true;
- }
- return false;
-}
-
SDValue DAGTypeLegalizer::SoftenFloatOp_BITCAST(SDNode *N) {
- return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0),
- GetSoftenedFloat(N->getOperand(0)));
-}
-
-SDValue DAGTypeLegalizer::SoftenFloatOp_COPY_TO_REG(SDNode *N) {
- SDValue Op1 = GetSoftenedFloat(N->getOperand(1));
- SDValue Op2 = GetSoftenedFloat(N->getOperand(2));
-
- if (Op1 == N->getOperand(1) && Op2 == N->getOperand(2))
- return SDValue();
-
- if (N->getNumOperands() == 3)
- return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Op1, Op2), 0);
+ SDValue Op0 = GetSoftenedFloat(N->getOperand(0));
- return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Op1, Op2,
- N->getOperand(3)),
- 0);
+ return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Op0);
}
SDValue DAGTypeLegalizer::SoftenFloatOp_FP_EXTEND(SDNode *N) {
@@ -868,7 +869,10 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_EXTEND(SDNode *N) {
RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, RVT);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND libcall");
- return TLI.makeLibCall(DAG, LC, RVT, Op, false, SDLoc(N)).first;
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
+ return TLI.makeLibCall(DAG, LC, RVT, Op, CallOptions, SDLoc(N)).first;
}
@@ -885,7 +889,10 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) {
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND libcall");
SDValue Op = GetSoftenedFloat(N->getOperand(0));
- return TLI.makeLibCall(DAG, LC, RVT, Op, false, SDLoc(N)).first;
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
+ return TLI.makeLibCall(DAG, LC, RVT, Op, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) {
@@ -895,7 +902,8 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) {
EVT VT = NewLHS.getValueType();
NewLHS = GetSoftenedFloat(NewLHS);
NewRHS = GetSoftenedFloat(NewRHS);
- TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, SDLoc(N));
+ TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, SDLoc(N),
+ N->getOperand(2), N->getOperand(3));
// If softenSetCCOperands returned a scalar, we need to compare the result
// against zero to select between true and false values.
@@ -911,34 +919,6 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) {
0);
}
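
For context on the "compare the result against zero" comment above: soft-float comparisons become integer libcalls whose return value encodes the ordering. A sketch assuming a libgcc/compiler-rt style runtime that provides __ltdf2, which is documented to return a value less than zero when the first argument is strictly less than the second (not standalone; it links only against such a runtime):

    extern "C" int __ltdf2(double a, double b);  // soft-float comparison routine

    bool fp_less(double a, double b) {
      // NewLHS = libcall result, NewRHS = 0, CCCode = SETLT
      return __ltdf2(a, b) < 0;
    }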
-SDValue DAGTypeLegalizer::SoftenFloatOp_FABS(SDNode *N) {
- SDValue Op = GetSoftenedFloat(N->getOperand(0));
-
- if (Op == N->getOperand(0))
- return SDValue();
-
- return SDValue(DAG.UpdateNodeOperands(N, Op), 0);
-}
-
-SDValue DAGTypeLegalizer::SoftenFloatOp_FCOPYSIGN(SDNode *N) {
- SDValue Op0 = GetSoftenedFloat(N->getOperand(0));
- SDValue Op1 = GetSoftenedFloat(N->getOperand(1));
-
- if (Op0 == N->getOperand(0) && Op1 == N->getOperand(1))
- return SDValue();
-
- return SDValue(DAG.UpdateNodeOperands(N, Op0, Op1), 0);
-}
-
-SDValue DAGTypeLegalizer::SoftenFloatOp_FNEG(SDNode *N) {
- SDValue Op = GetSoftenedFloat(N->getOperand(0));
-
- if (Op == N->getOperand(0))
- return SDValue();
-
- return SDValue(DAG.UpdateNodeOperands(N, Op), 0);
-}
-
SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_XINT(SDNode *N) {
bool Signed = N->getOpcode() == ISD::FP_TO_SINT;
EVT SVT = N->getOperand(0).getValueType();
@@ -962,23 +942,15 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_XINT(SDNode *N) {
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_XINT!");
SDValue Op = GetSoftenedFloat(N->getOperand(0));
- SDValue Res = TLI.makeLibCall(DAG, LC, NVT, Op, false, dl).first;
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
+ SDValue Res = TLI.makeLibCall(DAG, LC, NVT, Op, CallOptions, dl).first;
// Truncate the result if the libcall returns a larger type.
return DAG.getNode(ISD::TRUNCATE, dl, RVT, Res);
}
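
A standalone illustration of the widen-then-truncate step above (not part of the patch): converting f32 to i16 in soft float goes through the i32 conversion libcall (e.g. __fixsfsi on many targets; the exact symbol is target-dependent) and then truncates, since there is no i16 variant.

    #include <cstdint>
    #include <cstdio>

    int main() {
      float f = 1234.5f;
      int32_t wide = (int32_t)f;     // what the i32 libcall would return
      int16_t res  = (int16_t)wide;  // ISD::TRUNCATE back to the i16 result
      std::printf("%d\n", res);      // prints 1234
    }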
-SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT(SDNode *N) {
- SDValue Op1 = GetSoftenedFloat(N->getOperand(1));
- SDValue Op2 = GetSoftenedFloat(N->getOperand(2));
-
- if (Op1 == N->getOperand(1) && Op2 == N->getOperand(2))
- return SDValue();
-
- return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Op1, Op2),
- 0);
-}
-
SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) {
SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(4))->get();
@@ -986,7 +958,8 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) {
EVT VT = NewLHS.getValueType();
NewLHS = GetSoftenedFloat(NewLHS);
NewRHS = GetSoftenedFloat(NewRHS);
- TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, SDLoc(N));
+ TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, SDLoc(N),
+ N->getOperand(0), N->getOperand(1));
// If softenSetCCOperands returned a scalar, we need to compare the result
// against zero to select between true and false values.
@@ -1009,7 +982,8 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_SETCC(SDNode *N) {
EVT VT = NewLHS.getValueType();
NewLHS = GetSoftenedFloat(NewLHS);
NewRHS = GetSoftenedFloat(NewRHS);
- TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, SDLoc(N));
+ TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, SDLoc(N),
+ N->getOperand(0), N->getOperand(1));
// If softenSetCCOperands returned a scalar, use it.
if (!NewRHS.getNode()) {
@@ -1047,13 +1021,16 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_LROUND(SDNode *N) {
SDValue Op = GetSoftenedFloat(N->getOperand(0));
EVT RetVT = N->getOperand(0).getValueType().getSimpleVT().SimpleTy;
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(RetVT,
RTLIB::LROUND_F32,
RTLIB::LROUND_F64,
RTLIB::LROUND_F80,
RTLIB::LROUND_F128,
RTLIB::LROUND_PPCF128),
- NVT, Op, false, SDLoc(N)).first;
+ NVT, Op, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatOp_LLROUND(SDNode *N) {
@@ -1061,13 +1038,16 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_LLROUND(SDNode *N) {
SDValue Op = GetSoftenedFloat(N->getOperand(0));
EVT RetVT = N->getOperand(0).getValueType().getSimpleVT().SimpleTy;
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(RetVT,
RTLIB::LLROUND_F32,
RTLIB::LLROUND_F64,
RTLIB::LLROUND_F80,
RTLIB::LLROUND_F128,
RTLIB::LLROUND_PPCF128),
- NVT, Op, false, SDLoc(N)).first;
+ NVT, Op, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatOp_LRINT(SDNode *N) {
@@ -1075,13 +1055,16 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_LRINT(SDNode *N) {
SDValue Op = GetSoftenedFloat(N->getOperand(0));
EVT RetVT = N->getOperand(0).getValueType().getSimpleVT().SimpleTy;
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(RetVT,
RTLIB::LRINT_F32,
RTLIB::LRINT_F64,
RTLIB::LRINT_F80,
RTLIB::LRINT_F128,
RTLIB::LRINT_PPCF128),
- NVT, Op, false, SDLoc(N)).first;
+ NVT, Op, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::SoftenFloatOp_LLRINT(SDNode *N) {
@@ -1089,13 +1072,16 @@ SDValue DAGTypeLegalizer::SoftenFloatOp_LLRINT(SDNode *N) {
SDValue Op = GetSoftenedFloat(N->getOperand(0));
EVT RetVT = N->getOperand(0).getValueType().getSimpleVT().SimpleTy;
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[1] = { N->getOperand(0).getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, N->getValueType(0), true);
return TLI.makeLibCall(DAG, GetFPLibCall(RetVT,
RTLIB::LLRINT_F32,
RTLIB::LLRINT_F64,
RTLIB::LLRINT_F80,
RTLIB::LLRINT_F128,
RTLIB::LLRINT_PPCF128),
- NVT, Op, false, SDLoc(N)).first;
+ NVT, Op, CallOptions, SDLoc(N)).first;
}
//===----------------------------------------------------------------------===//
@@ -1267,13 +1253,14 @@ void DAGTypeLegalizer::ExpandFloatRes_FCOS(SDNode *N,
void DAGTypeLegalizer::ExpandFloatRes_FDIV(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+ TargetLowering::MakeLibCallOptions CallOptions;
SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::DIV_F32,
RTLIB::DIV_F64,
RTLIB::DIV_F80,
RTLIB::DIV_F128,
RTLIB::DIV_PPCF128),
- N->getValueType(0), Ops, false,
+ N->getValueType(0), Ops, CallOptions,
SDLoc(N)).first;
GetPairElements(Call, Lo, Hi);
}
@@ -1341,13 +1328,14 @@ void DAGTypeLegalizer::ExpandFloatRes_FLOG10(SDNode *N,
void DAGTypeLegalizer::ExpandFloatRes_FMA(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue Ops[3] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) };
+ TargetLowering::MakeLibCallOptions CallOptions;
SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::FMA_F32,
RTLIB::FMA_F64,
RTLIB::FMA_F80,
RTLIB::FMA_F128,
RTLIB::FMA_PPCF128),
- N->getValueType(0), Ops, false,
+ N->getValueType(0), Ops, CallOptions,
SDLoc(N)).first;
GetPairElements(Call, Lo, Hi);
}
@@ -1355,13 +1343,14 @@ void DAGTypeLegalizer::ExpandFloatRes_FMA(SDNode *N, SDValue &Lo,
void DAGTypeLegalizer::ExpandFloatRes_FMUL(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+ TargetLowering::MakeLibCallOptions CallOptions;
SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::MUL_F32,
RTLIB::MUL_F64,
RTLIB::MUL_F80,
RTLIB::MUL_F128,
RTLIB::MUL_PPCF128),
- N->getValueType(0), Ops, false,
+ N->getValueType(0), Ops, CallOptions,
SDLoc(N)).first;
GetPairElements(Call, Lo, Hi);
}
@@ -1470,13 +1459,14 @@ void DAGTypeLegalizer::ExpandFloatRes_FSQRT(SDNode *N,
void DAGTypeLegalizer::ExpandFloatRes_FSUB(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+ TargetLowering::MakeLibCallOptions CallOptions;
SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
RTLIB::SUB_F32,
RTLIB::SUB_F64,
RTLIB::SUB_F80,
RTLIB::SUB_F128,
RTLIB::SUB_PPCF128),
- N->getValueType(0), Ops, false,
+ N->getValueType(0), Ops, CallOptions,
SDLoc(N)).first;
GetPairElements(Call, Lo, Hi);
}
@@ -1555,7 +1545,9 @@ void DAGTypeLegalizer::ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo,
}
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XINT_TO_FP!");
- Hi = TLI.makeLibCall(DAG, LC, VT, Src, true, dl).first;
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(true);
+ Hi = TLI.makeLibCall(DAG, LC, VT, Src, CallOptions, dl).first;
GetPairElements(Hi, Lo, Hi);
}
@@ -1732,7 +1724,8 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_SINT(SDNode *N) {
RTLIB::Libcall LC = RTLIB::getFPTOSINT(N->getOperand(0).getValueType(), RVT);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_SINT!");
- return TLI.makeLibCall(DAG, LC, RVT, N->getOperand(0), false, dl).first;
+ TargetLowering::MakeLibCallOptions CallOptions;
+ return TLI.makeLibCall(DAG, LC, RVT, N->getOperand(0), CallOptions, dl).first;
}
SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) {
@@ -1741,8 +1734,9 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) {
RTLIB::Libcall LC = RTLIB::getFPTOUINT(N->getOperand(0).getValueType(), RVT);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_UINT!");
+ TargetLowering::MakeLibCallOptions CallOptions;
return TLI.makeLibCall(DAG, LC, N->getValueType(0), N->getOperand(0),
- false, dl).first;
+ CallOptions, dl).first;
}
SDValue DAGTypeLegalizer::ExpandFloatOp_SELECT_CC(SDNode *N) {
@@ -1807,49 +1801,53 @@ SDValue DAGTypeLegalizer::ExpandFloatOp_STORE(SDNode *N, unsigned OpNo) {
SDValue DAGTypeLegalizer::ExpandFloatOp_LROUND(SDNode *N) {
EVT RVT = N->getValueType(0);
EVT RetVT = N->getOperand(0).getValueType().getSimpleVT().SimpleTy;
+ TargetLowering::MakeLibCallOptions CallOptions;
return TLI.makeLibCall(DAG, GetFPLibCall(RetVT,
RTLIB::LROUND_F32,
RTLIB::LROUND_F64,
RTLIB::LROUND_F80,
RTLIB::LROUND_F128,
RTLIB::LROUND_PPCF128),
- RVT, N->getOperand(0), false, SDLoc(N)).first;
+ RVT, N->getOperand(0), CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::ExpandFloatOp_LLROUND(SDNode *N) {
EVT RVT = N->getValueType(0);
EVT RetVT = N->getOperand(0).getValueType().getSimpleVT().SimpleTy;
+ TargetLowering::MakeLibCallOptions CallOptions;
return TLI.makeLibCall(DAG, GetFPLibCall(RetVT,
RTLIB::LLROUND_F32,
RTLIB::LLROUND_F64,
RTLIB::LLROUND_F80,
RTLIB::LLROUND_F128,
RTLIB::LLROUND_PPCF128),
- RVT, N->getOperand(0), false, SDLoc(N)).first;
+ RVT, N->getOperand(0), CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::ExpandFloatOp_LRINT(SDNode *N) {
EVT RVT = N->getValueType(0);
EVT RetVT = N->getOperand(0).getValueType().getSimpleVT().SimpleTy;
+ TargetLowering::MakeLibCallOptions CallOptions;
return TLI.makeLibCall(DAG, GetFPLibCall(RetVT,
RTLIB::LRINT_F32,
RTLIB::LRINT_F64,
RTLIB::LRINT_F80,
RTLIB::LRINT_F128,
RTLIB::LRINT_PPCF128),
- RVT, N->getOperand(0), false, SDLoc(N)).first;
+ RVT, N->getOperand(0), CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::ExpandFloatOp_LLRINT(SDNode *N) {
EVT RVT = N->getValueType(0);
EVT RetVT = N->getOperand(0).getValueType().getSimpleVT().SimpleTy;
+ TargetLowering::MakeLibCallOptions CallOptions;
return TLI.makeLibCall(DAG, GetFPLibCall(RetVT,
RTLIB::LLRINT_F32,
RTLIB::LLRINT_F64,
RTLIB::LLRINT_F80,
RTLIB::LLRINT_F128,
RTLIB::LLRINT_PPCF128),
- RVT, N->getOperand(0), false, SDLoc(N)).first;
+ RVT, N->getOperand(0), CallOptions, SDLoc(N)).first;
}
//===----------------------------------------------------------------------===//
@@ -2002,6 +2000,12 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
dbgs() << "\n");
SDValue R = SDValue();
+ // See if the target wants to custom expand this node.
+ if (CustomLowerNode(N, N->getValueType(ResNo), true)) {
+ LLVM_DEBUG(dbgs() << "Node has been custom expanded, done\n");
+ return;
+ }
+
switch (N->getOpcode()) {
// These opcodes cannot appear if promotion of FP16 is done in the backend
// instead of Clang
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 15ac45c37c66..d5c1b539adbd 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -100,6 +100,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
Res = PromoteIntRes_BUILD_VECTOR(N); break;
case ISD::SCALAR_TO_VECTOR:
Res = PromoteIntRes_SCALAR_TO_VECTOR(N); break;
+ case ISD::SPLAT_VECTOR:
+ Res = PromoteIntRes_SPLAT_VECTOR(N); break;
case ISD::CONCAT_VECTORS:
Res = PromoteIntRes_CONCAT_VECTORS(N); break;
@@ -112,6 +114,8 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::ZERO_EXTEND:
case ISD::ANY_EXTEND: Res = PromoteIntRes_INT_EXTEND(N); break;
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT: Res = PromoteIntRes_FP_TO_XINT(N); break;
@@ -148,9 +152,12 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::UADDSAT:
case ISD::SSUBSAT:
case ISD::USUBSAT: Res = PromoteIntRes_ADDSUBSAT(N); break;
+
case ISD::SMULFIX:
case ISD::SMULFIXSAT:
- case ISD::UMULFIX: Res = PromoteIntRes_MULFIX(N); break;
+ case ISD::UMULFIX:
+ case ISD::UMULFIXSAT: Res = PromoteIntRes_MULFIX(N); break;
+
case ISD::ABS: Res = PromoteIntRes_ABS(N); break;
case ISD::ATOMIC_LOAD:
@@ -494,7 +501,20 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT(SDNode *N) {
TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NVT))
NewOpc = ISD::FP_TO_SINT;
- SDValue Res = DAG.getNode(NewOpc, dl, NVT, N->getOperand(0));
+ if (N->getOpcode() == ISD::STRICT_FP_TO_UINT &&
+ !TLI.isOperationLegal(ISD::STRICT_FP_TO_UINT, NVT) &&
+ TLI.isOperationLegalOrCustom(ISD::STRICT_FP_TO_SINT, NVT))
+ NewOpc = ISD::STRICT_FP_TO_SINT;
+
+ SDValue Res;
+ if (N->isStrictFPOpcode()) {
+ Res = DAG.getNode(NewOpc, dl, { NVT, MVT::Other },
+ { N->getOperand(0), N->getOperand(1) });
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ } else
+ Res = DAG.getNode(NewOpc, dl, NVT, N->getOperand(0));
// Assert that the converted value fits in the original type. If it doesn't
// (eg: because the value being converted is too big), then the result of the
@@ -503,7 +523,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT(SDNode *N) {
// NOTE: fp-to-uint to fp-to-sint promotion guarantees zero extend. For example:
// before legalization: fp-to-uint16, 65534. -> 0xfffe
// after legalization: fp-to-sint32, 65534. -> 0x0000fffe
- return DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ?
+ return DAG.getNode((N->getOpcode() == ISD::FP_TO_UINT ||
+ N->getOpcode() == ISD::STRICT_FP_TO_UINT) ?
ISD::AssertZext : ISD::AssertSext, dl, NVT, Res,
DAG.getValueType(N->getValueType(0).getScalarType()));
}
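
A standalone check of the NOTE above (not part of the patch): 65534.0 is out of range for a direct fp-to-sint16, but the promoted fp-to-sint32 result is exactly the zero extension of the i16 value, which is why AssertZext is sound for the unsigned opcodes.

    #include <cstdint>
    #include <cstdio>

    int main() {
      float f = 65534.0f;
      int32_t promoted = (int32_t)f;          // FP_TO_SINT on the promoted i32
      uint16_t narrow  = (uint16_t)promoted;  // the original fp-to-uint16 value
      std::printf("0x%08x %u\n", promoted, narrow);  // prints 0x0000fffe 65534
    }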
@@ -590,7 +611,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) {
N->getIndex(), N->getScale() };
SDValue Res = DAG.getMaskedGather(DAG.getVTList(NVT, MVT::Other),
N->getMemoryVT(), dl, Ops,
- N->getMemOperand());
+ N->getMemOperand(), N->getIndexType());
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
@@ -623,48 +644,84 @@ SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) {
}
SDValue DAGTypeLegalizer::PromoteIntRes_ADDSUBSAT(SDNode *N) {
- // For promoting iN -> iM, this can be expanded by
- // 1. ANY_EXTEND iN to iM
- // 2. SHL by M-N
- // 3. [US][ADD|SUB]SAT
- // 4. L/ASHR by M-N
+ // If the saturating operation is legal or custom on the promoted type, this
+ // can be expanded by:
+ // 1. ANY_EXTEND iN to iM
+ // 2. SHL by M-N
+ // 3. [US][ADD|SUB]SAT
+ // 4. L/ASHR by M-N
+ // Otherwise it is more efficient to convert this to a min and a max
+ // operation in the higher precision arithmetic.
SDLoc dl(N);
SDValue Op1 = N->getOperand(0);
SDValue Op2 = N->getOperand(1);
unsigned OldBits = Op1.getScalarValueSizeInBits();
unsigned Opcode = N->getOpcode();
- unsigned ShiftOp;
- switch (Opcode) {
- case ISD::SADDSAT:
- case ISD::SSUBSAT:
- ShiftOp = ISD::SRA;
- break;
- case ISD::UADDSAT:
- case ISD::USUBSAT:
- ShiftOp = ISD::SRL;
- break;
- default:
- llvm_unreachable("Expected opcode to be signed or unsigned saturation "
- "addition or subtraction");
- }
-
- SDValue Op1Promoted = GetPromotedInteger(Op1);
- SDValue Op2Promoted = GetPromotedInteger(Op2);
+ SDValue Op1Promoted, Op2Promoted;
+ if (Opcode == ISD::UADDSAT || Opcode == ISD::USUBSAT) {
+ Op1Promoted = ZExtPromotedInteger(Op1);
+ Op2Promoted = ZExtPromotedInteger(Op2);
+ } else {
+ Op1Promoted = SExtPromotedInteger(Op1);
+ Op2Promoted = SExtPromotedInteger(Op2);
+ }
EVT PromotedType = Op1Promoted.getValueType();
unsigned NewBits = PromotedType.getScalarSizeInBits();
- unsigned SHLAmount = NewBits - OldBits;
- EVT SHVT = TLI.getShiftAmountTy(PromotedType, DAG.getDataLayout());
- SDValue ShiftAmount = DAG.getConstant(SHLAmount, dl, SHVT);
- Op1Promoted =
- DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted, ShiftAmount);
- Op2Promoted =
- DAG.getNode(ISD::SHL, dl, PromotedType, Op2Promoted, ShiftAmount);
- SDValue Result =
- DAG.getNode(Opcode, dl, PromotedType, Op1Promoted, Op2Promoted);
- return DAG.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount);
+ if (TLI.isOperationLegalOrCustom(Opcode, PromotedType)) {
+ unsigned ShiftOp;
+ switch (Opcode) {
+ case ISD::SADDSAT:
+ case ISD::SSUBSAT:
+ ShiftOp = ISD::SRA;
+ break;
+ case ISD::UADDSAT:
+ case ISD::USUBSAT:
+ ShiftOp = ISD::SRL;
+ break;
+ default:
+ llvm_unreachable("Expected opcode to be signed or unsigned saturation "
+ "addition or subtraction");
+ }
+
+ unsigned SHLAmount = NewBits - OldBits;
+ EVT SHVT = TLI.getShiftAmountTy(PromotedType, DAG.getDataLayout());
+ SDValue ShiftAmount = DAG.getConstant(SHLAmount, dl, SHVT);
+ Op1Promoted =
+ DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted, ShiftAmount);
+ Op2Promoted =
+ DAG.getNode(ISD::SHL, dl, PromotedType, Op2Promoted, ShiftAmount);
+
+ SDValue Result =
+ DAG.getNode(Opcode, dl, PromotedType, Op1Promoted, Op2Promoted);
+ return DAG.getNode(ShiftOp, dl, PromotedType, Result, ShiftAmount);
+ } else {
+ if (Opcode == ISD::USUBSAT) {
+ SDValue Max =
+ DAG.getNode(ISD::UMAX, dl, PromotedType, Op1Promoted, Op2Promoted);
+ return DAG.getNode(ISD::SUB, dl, PromotedType, Max, Op2Promoted);
+ }
+
+ if (Opcode == ISD::UADDSAT) {
+ APInt MaxVal = APInt::getAllOnesValue(OldBits).zext(NewBits);
+ SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType);
+ SDValue Add =
+ DAG.getNode(ISD::ADD, dl, PromotedType, Op1Promoted, Op2Promoted);
+ return DAG.getNode(ISD::UMIN, dl, PromotedType, Add, SatMax);
+ }
+
+ unsigned AddOp = Opcode == ISD::SADDSAT ? ISD::ADD : ISD::SUB;
+ APInt MinVal = APInt::getSignedMinValue(OldBits).sext(NewBits);
+ APInt MaxVal = APInt::getSignedMaxValue(OldBits).sext(NewBits);
+ SDValue SatMin = DAG.getConstant(MinVal, dl, PromotedType);
+ SDValue SatMax = DAG.getConstant(MaxVal, dl, PromotedType);
+ SDValue Result =
+ DAG.getNode(AddOp, dl, PromotedType, Op1Promoted, Op2Promoted);
+ Result = DAG.getNode(ISD::SMIN, dl, PromotedType, Result, SatMax);
+ Result = DAG.getNode(ISD::SMAX, dl, PromotedType, Result, SatMin);
+ return Result;
+ }
}
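
A standalone model of the new min/max path above (not part of the patch), for an i8 saddsat promoted to i32: extend, add exactly, then clamp to the original type's signed range.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    static int8_t saddsat_i8(int8_t a, int8_t b) {
      int32_t wide = (int32_t)a + (int32_t)b;    // SExtPromotedInteger + ADD
      wide = std::min(wide, (int32_t)INT8_MAX);  // SMIN against SatMax
      wide = std::max(wide, (int32_t)INT8_MIN);  // SMAX against SatMin
      return (int8_t)wide;
    }

    int main() {
      std::printf("%d\n", saddsat_i8(100, 100)); // prints 127, not -56
    }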
SDValue DAGTypeLegalizer::PromoteIntRes_MULFIX(SDNode *N) {
@@ -673,6 +730,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MULFIX(SDNode *N) {
SDValue Op1Promoted, Op2Promoted;
bool Signed =
N->getOpcode() == ISD::SMULFIX || N->getOpcode() == ISD::SMULFIXSAT;
+ bool Saturating =
+ N->getOpcode() == ISD::SMULFIXSAT || N->getOpcode() == ISD::UMULFIXSAT;
if (Signed) {
Op1Promoted = SExtPromotedInteger(N->getOperand(0));
Op2Promoted = SExtPromotedInteger(N->getOperand(1));
@@ -685,7 +744,6 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MULFIX(SDNode *N) {
unsigned DiffSize =
PromotedType.getScalarSizeInBits() - OldType.getScalarSizeInBits();
- bool Saturating = N->getOpcode() == ISD::SMULFIXSAT;
if (Saturating) {
// Promoting the operand and result values changes the saturation width,
// which extends the values that we clamp to on saturation. This could be
@@ -1110,6 +1168,8 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
Res = PromoteIntOp_INSERT_VECTOR_ELT(N, OpNo);break;
case ISD::SCALAR_TO_VECTOR:
Res = PromoteIntOp_SCALAR_TO_VECTOR(N); break;
+ case ISD::SPLAT_VECTOR:
+ Res = PromoteIntOp_SPLAT_VECTOR(N); break;
case ISD::VSELECT:
case ISD::SELECT: Res = PromoteIntOp_SELECT(N, OpNo); break;
case ISD::SELECT_CC: Res = PromoteIntOp_SELECT_CC(N, OpNo); break;
@@ -1148,7 +1208,8 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::SMULFIX:
case ISD::SMULFIXSAT:
- case ISD::UMULFIX: Res = PromoteIntOp_MULFIX(N); break;
+ case ISD::UMULFIX:
+ case ISD::UMULFIXSAT: Res = PromoteIntOp_MULFIX(N); break;
case ISD::FPOWI: Res = PromoteIntOp_FPOWI(N); break;
@@ -1339,6 +1400,13 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N) {
GetPromotedInteger(N->getOperand(0))), 0);
}
+SDValue DAGTypeLegalizer::PromoteIntOp_SPLAT_VECTOR(SDNode *N) {
+ // Integer SPLAT_VECTOR operands are implicitly truncated, so just promote the
+ // operand in place.
+ return SDValue(
+ DAG.UpdateNodeOperands(N, GetPromotedInteger(N->getOperand(0))), 0);
+}
+
SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) {
assert(OpNo == 0 && "Only know how to promote the condition!");
SDValue Cond = N->getOperand(0);
@@ -1454,8 +1522,12 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MGATHER(MaskedGatherSDNode *N,
EVT DataVT = N->getValueType(0);
NewOps[OpNo] = PromoteTargetBoolean(N->getOperand(OpNo), DataVT);
} else if (OpNo == 4) {
- // Need to sign extend the index since the bits will likely be used.
- NewOps[OpNo] = SExtPromotedInteger(N->getOperand(OpNo));
+ // The Index
+ if (N->isIndexSigned())
+ // Need to sign extend the index since the bits will likely be used.
+ NewOps[OpNo] = SExtPromotedInteger(N->getOperand(OpNo));
+ else
+ NewOps[OpNo] = ZExtPromotedInteger(N->getOperand(OpNo));
} else
NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo));
@@ -1470,8 +1542,12 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N,
EVT DataVT = N->getValue().getValueType();
NewOps[OpNo] = PromoteTargetBoolean(N->getOperand(OpNo), DataVT);
} else if (OpNo == 4) {
- // Need to sign extend the index since the bits will likely be used.
- NewOps[OpNo] = SExtPromotedInteger(N->getOperand(OpNo));
+ // The Index
+ if (N->isIndexSigned())
+ // Need to sign extend the index since the bits will likely be used.
+ NewOps[OpNo] = SExtPromotedInteger(N->getOperand(OpNo));
+ else
+ NewOps[OpNo] = ZExtPromotedInteger(N->getOperand(OpNo));
} else
NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo));
return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
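
A standalone illustration of why the new isIndexSigned() check matters (not part of the patch): gathers and scatters compute base + extend(index) * scale, and a narrow index with its top bit set addresses different memory depending on the extension kind.

    #include <cstdint>
    #include <cstdio>

    int main() {
      int16_t  idx   = -4;      // index bits 0xFFFC
      uint64_t base  = 0x1000;
      uint64_t scale = 4;
      uint64_t sextAddr = base + (uint64_t)((int64_t)idx * (int64_t)scale);
      uint64_t zextAddr = base + (uint64_t)(uint16_t)idx * scale;
      std::printf("0x%llx 0x%llx\n",
                  (unsigned long long)sextAddr,   // prints 0xff0
                  (unsigned long long)zextAddr);  // and 0x40ff0
    }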
@@ -1715,7 +1791,8 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::SMULFIX:
case ISD::SMULFIXSAT:
- case ISD::UMULFIX: ExpandIntRes_MULFIX(N, Lo, Hi); break;
+ case ISD::UMULFIX:
+ case ISD::UMULFIXSAT: ExpandIntRes_MULFIX(N, Lo, Hi); break;
case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_MUL:
@@ -2473,7 +2550,9 @@ void DAGTypeLegalizer::ExpandIntRes_FP_TO_SINT(SDNode *N, SDValue &Lo,
RTLIB::Libcall LC = RTLIB::getFPTOSINT(Op.getValueType(), VT);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-sint conversion!");
- SplitInteger(TLI.makeLibCall(DAG, LC, VT, Op, true/*irrelevant*/, dl).first,
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(true);
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Op, CallOptions, dl).first,
Lo, Hi);
}
@@ -2488,7 +2567,8 @@ void DAGTypeLegalizer::ExpandIntRes_FP_TO_UINT(SDNode *N, SDValue &Lo,
RTLIB::Libcall LC = RTLIB::getFPTOUINT(Op.getValueType(), VT);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-uint conversion!");
- SplitInteger(TLI.makeLibCall(DAG, LC, VT, Op, false/*irrelevant*/, dl).first,
+ TargetLowering::MakeLibCallOptions CallOptions;
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Op, CallOptions, dl).first,
Lo, Hi);
}
@@ -2514,7 +2594,9 @@ void DAGTypeLegalizer::ExpandIntRes_LLROUND(SDNode *N, SDValue &Lo,
SDLoc dl(N);
EVT RetVT = N->getValueType(0);
- SplitInteger(TLI.makeLibCall(DAG, LC, RetVT, Op, true/*irrelevant*/, dl).first,
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(true);
+ SplitInteger(TLI.makeLibCall(DAG, LC, RetVT, Op, CallOptions, dl).first,
Lo, Hi);
}
@@ -2540,7 +2622,9 @@ void DAGTypeLegalizer::ExpandIntRes_LLRINT(SDNode *N, SDValue &Lo,
SDLoc dl(N);
EVT RetVT = N->getValueType(0);
- SplitInteger(TLI.makeLibCall(DAG, LC, RetVT, Op, true/*irrelevant*/, dl).first,
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(true);
+ SplitInteger(TLI.makeLibCall(DAG, LC, RetVT, Op, CallOptions, dl).first,
Lo, Hi);
}
@@ -2743,7 +2827,9 @@ void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N,
}
SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
- SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, true/*irrelevant*/, dl).first,
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(true);
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first,
Lo, Hi);
}
@@ -2777,38 +2863,53 @@ void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo,
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
uint64_t Scale = N->getConstantOperandVal(2);
- bool Saturating = N->getOpcode() == ISD::SMULFIXSAT;
- EVT BoolVT = getSetCCResultType(VT);
- SDValue Zero = DAG.getConstant(0, dl, VT);
+ bool Saturating = (N->getOpcode() == ISD::SMULFIXSAT ||
+ N->getOpcode() == ISD::UMULFIXSAT);
+ bool Signed = (N->getOpcode() == ISD::SMULFIX ||
+ N->getOpcode() == ISD::SMULFIXSAT);
+
+ // Handle special case when scale is equal to zero.
if (!Scale) {
SDValue Result;
if (!Saturating) {
Result = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
} else {
- Result = DAG.getNode(ISD::SMULO, dl, DAG.getVTList(VT, BoolVT), LHS, RHS);
+ EVT BoolVT = getSetCCResultType(VT);
+ unsigned MulOp = Signed ? ISD::SMULO : ISD::UMULO;
+ Result = DAG.getNode(MulOp, dl, DAG.getVTList(VT, BoolVT), LHS, RHS);
SDValue Product = Result.getValue(0);
SDValue Overflow = Result.getValue(1);
-
- APInt MinVal = APInt::getSignedMinValue(VTSize);
- APInt MaxVal = APInt::getSignedMaxValue(VTSize);
- SDValue SatMin = DAG.getConstant(MinVal, dl, VT);
- SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
- SDValue ProdNeg = DAG.getSetCC(dl, BoolVT, Product, Zero, ISD::SETLT);
- Result = DAG.getSelect(dl, VT, ProdNeg, SatMax, SatMin);
- Result = DAG.getSelect(dl, VT, Overflow, Result, Product);
+ if (Signed) {
+ APInt MinVal = APInt::getSignedMinValue(VTSize);
+ APInt MaxVal = APInt::getSignedMaxValue(VTSize);
+ SDValue SatMin = DAG.getConstant(MinVal, dl, VT);
+ SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+ SDValue ProdNeg = DAG.getSetCC(dl, BoolVT, Product, Zero, ISD::SETLT);
+ Result = DAG.getSelect(dl, VT, ProdNeg, SatMax, SatMin);
+ Result = DAG.getSelect(dl, VT, Overflow, Result, Product);
+ } else {
+ // For unsigned multiplication, we only need to check the max since we
+ // can't really overflow towards zero.
+ APInt MaxVal = APInt::getMaxValue(VTSize);
+ SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
+ Result = DAG.getSelect(dl, VT, Overflow, SatMax, Product);
+ }
}
SplitInteger(Result, Lo, Hi);
return;
}
+ // For SMULFIX[SAT] we only expect to find Scale<VTSize, but this assert will
+ // cover unhandled cases below, while still being valid for UMULFIX[SAT].
+ assert(Scale <= VTSize && "Scale can't be larger than the value type size.");
+
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
SDValue LL, LH, RL, RH;
GetExpandedInteger(LHS, LL, LH);
GetExpandedInteger(RHS, RL, RH);
SmallVector<SDValue, 4> Result;
- bool Signed = (N->getOpcode() == ISD::SMULFIX ||
- N->getOpcode() == ISD::SMULFIXSAT);
unsigned LoHiOp = Signed ? ISD::SMUL_LOHI : ISD::UMUL_LOHI;
if (!TLI.expandMUL_LOHI(LoHiOp, VT, dl, LHS, RHS, Result, NVT, DAG,
TargetLowering::MulExpansionKind::OnlyLegalOrCustom,
@@ -2822,19 +2923,9 @@ void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo,
"the size of the current value type");
EVT ShiftTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout());
- // Shift whole amount by scale.
- SDValue ResultLL = Result[0];
- SDValue ResultLH = Result[1];
- SDValue ResultHL = Result[2];
- SDValue ResultHH = Result[3];
-
- SDValue SatMax, SatMin;
- SDValue NVTZero = DAG.getConstant(0, dl, NVT);
- SDValue NVTNeg1 = DAG.getConstant(-1, dl, NVT);
- EVT BoolNVT = getSetCCResultType(NVT);
-
- // After getting the multplication result in 4 parts, we need to perform a
+ // After getting the multiplication result in 4 parts, we need to perform a
// shift right by the amount of the scale to get the result in that scale.
+ //
// Let's say we multiply 2 64 bit numbers. The resulting value can be held in
// 128 bits that are cut into 4 32-bit parts:
//
@@ -2846,123 +2937,135 @@ void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo,
//
// |NVTSize-|
//
- // The resulting Lo and Hi will only need to be one of these 32-bit parts
- // after shifting.
+ // The resulting Lo and Hi would normally be in LL and LH after the shift. But
+ // to avoid unnecessary shifting of all 4 parts, we can adjust the shift
+ // amount and get Lo and Hi using two funnel shifts. Or for the special case
+ // when Scale is a multiple of NVTSize we can just pick the result without
+ // shifting.
+ uint64_t Part0 = Scale / NVTSize; // Part holding lowest bit needed.
+ if (Scale % NVTSize) {
+ SDValue ShiftAmount = DAG.getConstant(Scale % NVTSize, dl, ShiftTy);
+ Lo = DAG.getNode(ISD::FSHR, dl, NVT, Result[Part0 + 1], Result[Part0],
+ ShiftAmount);
+ Hi = DAG.getNode(ISD::FSHR, dl, NVT, Result[Part0 + 2], Result[Part0 + 1],
+ ShiftAmount);
+ } else {
+ Lo = Result[Part0];
+ Hi = Result[Part0 + 1];
+ }
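
A standalone model of the funnel-shift selection above (not part of the patch), assuming a 64-bit VT expanded into 32-bit parts and Scale = 8: FSHR(hi, lo, amt) yields the low NVTSize bits of the concatenation hi:lo shifted right by amt, so adjacent parts of the product combine without shifting all four.

    #include <cstdint>
    #include <cstdio>

    // ISD::FSHR on i32: low 32 bits of (hi:lo) >> amt, for 0 < amt < 32.
    static uint32_t fshr32(uint32_t hi, uint32_t lo, unsigned amt) {
      return (uint32_t)((((uint64_t)hi << 32) | lo) >> amt);
    }

    int main() {
      uint32_t part0 = 0xDDCCBBAAu, part1 = 0x44332211u;  // Result[0], Result[1]
      // Scale = 8, NVTSize = 32 -> Part0 = 0, shift amount = Scale % NVTSize = 8.
      std::printf("0x%08x\n", fshr32(part1, part0, 8));   // prints 0x11ddccbb
    }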
+
+ // Unless saturation is requested we are done. The result is in <Hi,Lo>.
+ if (!Saturating)
+ return;
+
+ // Cannot overflow when there is no integer part.
+ if (Scale == VTSize)
+ return;
+
+ // To handle saturation we must check for overflow in the multiplication.
+ //
+ // Unsigned overflow happened if the upper (VTSize - Scale) bits (of Result)
+ // aren't all zeroes.
+ //
+ // Signed overflow happened if the upper (VTSize - Scale + 1) bits (of Result)
+ // aren't all ones or all zeroes.
+ //
+ // We cannot overflow past HH when multiplying 2 ints of size VTSize, so the
+ // highest bit of HH determines saturation direction in the event of signed
+ // saturation.
+
+ SDValue ResultHL = Result[2];
+ SDValue ResultHH = Result[3];
+
+ SDValue SatMax, SatMin;
+ SDValue NVTZero = DAG.getConstant(0, dl, NVT);
+ SDValue NVTNeg1 = DAG.getConstant(-1, dl, NVT);
+ EVT BoolNVT = getSetCCResultType(NVT);
+
+ if (!Signed) {
+ if (Scale < NVTSize) {
+ // Overflow happened if ((HH | (HL >> Scale)) != 0).
+ SDValue HLAdjusted = DAG.getNode(ISD::SRL, dl, NVT, ResultHL,
+ DAG.getConstant(Scale, dl, ShiftTy));
+ SDValue Tmp = DAG.getNode(ISD::OR, dl, NVT, HLAdjusted, ResultHH);
+ SatMax = DAG.getSetCC(dl, BoolNVT, Tmp, NVTZero, ISD::SETNE);
+ } else if (Scale == NVTSize) {
+ // Overflow happened if (HH != 0).
+ SatMax = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETNE);
+ } else if (Scale < VTSize) {
+ // Overflow happened if ((HH >> (Scale - NVTSize)) != 0).
+ SDValue HLAdjusted = DAG.getNode(ISD::SRL, dl, NVT, ResultHL,
+ DAG.getConstant(Scale - NVTSize, dl,
+ ShiftTy));
+ SatMax = DAG.getSetCC(dl, BoolNVT, HLAdjusted, NVTZero, ISD::SETNE);
+ } else
+ llvm_unreachable("Scale must be less or equal to VTSize for UMULFIXSAT"
+ "(and saturation can't happen with Scale==VTSize).");
+
+ Hi = DAG.getSelect(dl, NVT, SatMax, NVTNeg1, Hi);
+ Lo = DAG.getSelect(dl, NVT, SatMax, NVTNeg1, Lo);
+ return;
+ }
+
if (Scale < NVTSize) {
- // If the scale is less than the size of the VT we expand to, the Hi and
- // Lo of the result will be in the first 2 parts of the result after
- // shifting right. This only requires shifting by the scale as far as the
- // third part in the result (ResultHL).
- SDValue SRLAmnt = DAG.getConstant(Scale, dl, ShiftTy);
- SDValue SHLAmnt = DAG.getConstant(NVTSize - Scale, dl, ShiftTy);
- Lo = DAG.getNode(ISD::SRL, dl, NVT, ResultLL, SRLAmnt);
- Lo = DAG.getNode(ISD::OR, dl, NVT, Lo,
- DAG.getNode(ISD::SHL, dl, NVT, ResultLH, SHLAmnt));
- Hi = DAG.getNode(ISD::SRL, dl, NVT, ResultLH, SRLAmnt);
- Hi = DAG.getNode(ISD::OR, dl, NVT, Hi,
- DAG.getNode(ISD::SHL, dl, NVT, ResultHL, SHLAmnt));
-
- // We cannot overflow past HH when multiplying 2 ints of size VTSize, so the
- // highest bit of HH determines saturation direction in the event of
- // saturation.
// The number of overflow bits we can check are VTSize - Scale + 1 (we
// include the sign bit). If these top bits are > 0, then we overflowed past
// the max value. If these top bits are < -1, then we overflowed past the
// min value. Otherwise, we did not overflow.
- if (Saturating) {
- unsigned OverflowBits = VTSize - Scale + 1;
- assert(OverflowBits <= VTSize && OverflowBits > NVTSize &&
- "Extent of overflow bits must start within HL");
- SDValue HLHiMask = DAG.getConstant(
- APInt::getHighBitsSet(NVTSize, OverflowBits - NVTSize), dl, NVT);
- SDValue HLLoMask = DAG.getConstant(
- APInt::getLowBitsSet(NVTSize, VTSize - OverflowBits), dl, NVT);
-
- // HH > 0 or HH == 0 && HL > HLLoMask
- SDValue HHPos = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETGT);
- SDValue HHZero = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETEQ);
- SDValue HLPos =
- DAG.getSetCC(dl, BoolNVT, ResultHL, HLLoMask, ISD::SETUGT);
- SatMax = DAG.getNode(ISD::OR, dl, BoolNVT, HHPos,
- DAG.getNode(ISD::AND, dl, BoolNVT, HHZero, HLPos));
-
- // HH < -1 or HH == -1 && HL < HLHiMask
- SDValue HHNeg = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETLT);
- SDValue HHNeg1 = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETEQ);
- SDValue HLNeg =
- DAG.getSetCC(dl, BoolNVT, ResultHL, HLHiMask, ISD::SETULT);
- SatMin = DAG.getNode(ISD::OR, dl, BoolNVT, HHNeg,
- DAG.getNode(ISD::AND, dl, BoolNVT, HHNeg1, HLNeg));
- }
+ unsigned OverflowBits = VTSize - Scale + 1;
+ assert(OverflowBits <= VTSize && OverflowBits > NVTSize &&
+ "Extent of overflow bits must start within HL");
+ SDValue HLHiMask = DAG.getConstant(
+ APInt::getHighBitsSet(NVTSize, OverflowBits - NVTSize), dl, NVT);
+ SDValue HLLoMask = DAG.getConstant(
+ APInt::getLowBitsSet(NVTSize, VTSize - OverflowBits), dl, NVT);
+ // We overflow max if HH > 0 or (HH == 0 && HL > HLLoMask).
+ SDValue HHGT0 = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETGT);
+ SDValue HHEQ0 = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETEQ);
+ SDValue HLUGT = DAG.getSetCC(dl, BoolNVT, ResultHL, HLLoMask, ISD::SETUGT);
+ SatMax = DAG.getNode(ISD::OR, dl, BoolNVT, HHGT0,
+ DAG.getNode(ISD::AND, dl, BoolNVT, HHEQ0, HLUGT));
+ // We overflow min if HH < -1 or (HH == -1 && HL < HLHiMask).
+ SDValue HHLT = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETLT);
+ SDValue HHEQ = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETEQ);
+ SDValue HLULT = DAG.getSetCC(dl, BoolNVT, ResultHL, HLHiMask, ISD::SETULT);
+ SatMin = DAG.getNode(ISD::OR, dl, BoolNVT, HHLT,
+ DAG.getNode(ISD::AND, dl, BoolNVT, HHEQ, HLULT));
} else if (Scale == NVTSize) {
- // If the scales are equal, Lo and Hi are ResultLH and Result HL,
- // respectively. Avoid shifting to prevent undefined behavior.
- Lo = ResultLH;
- Hi = ResultHL;
-
- // We overflow max if HH > 0 or HH == 0 && HL sign bit is 1.
- // We overflow min if HH < -1 or HH == -1 && HL sign bit is 0.
- if (Saturating) {
- SDValue HHPos = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETGT);
- SDValue HHZero = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETEQ);
- SDValue HLNeg = DAG.getSetCC(dl, BoolNVT, ResultHL, NVTZero, ISD::SETLT);
- SatMax = DAG.getNode(ISD::OR, dl, BoolNVT, HHPos,
- DAG.getNode(ISD::AND, dl, BoolNVT, HHZero, HLNeg));
-
- SDValue HHNeg = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETLT);
- SDValue HHNeg1 = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETEQ);
- SDValue HLPos = DAG.getSetCC(dl, BoolNVT, ResultHL, NVTZero, ISD::SETGE);
- SatMin = DAG.getNode(ISD::OR, dl, BoolNVT, HHNeg,
- DAG.getNode(ISD::AND, dl, BoolNVT, HHNeg1, HLPos));
- }
+ // We overflow max if HH > 0 or (HH == 0 && HL sign bit is 1).
+ SDValue HHGT0 = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETGT);
+ SDValue HHEQ0 = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTZero, ISD::SETEQ);
+ SDValue HLNeg = DAG.getSetCC(dl, BoolNVT, ResultHL, NVTZero, ISD::SETLT);
+ SatMax = DAG.getNode(ISD::OR, dl, BoolNVT, HHGT0,
+ DAG.getNode(ISD::AND, dl, BoolNVT, HHEQ0, HLNeg));
+ // We overflow min if HH < -1 or (HH == -1 && HL sign bit is 0).
+ SDValue HHLT = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETLT);
+ SDValue HHEQ = DAG.getSetCC(dl, BoolNVT, ResultHH, NVTNeg1, ISD::SETEQ);
+ SDValue HLPos = DAG.getSetCC(dl, BoolNVT, ResultHL, NVTZero, ISD::SETGE);
+ SatMin = DAG.getNode(ISD::OR, dl, BoolNVT, HHLT,
+ DAG.getNode(ISD::AND, dl, BoolNVT, HHEQ, HLPos));
} else if (Scale < VTSize) {
- // If the scale is instead less than the old VT size, but greater than or
- // equal to the expanded VT size, the first part of the result (ResultLL) is
- // no longer a part of Lo because it would be scaled out anyway. Instead we
- // can start shifting right from the fourth part (ResultHH) to the second
- // part (ResultLH), and Result LH will be the new Lo.
- SDValue SRLAmnt = DAG.getConstant(Scale - NVTSize, dl, ShiftTy);
- SDValue SHLAmnt = DAG.getConstant(VTSize - Scale, dl, ShiftTy);
- Lo = DAG.getNode(ISD::SRL, dl, NVT, ResultLH, SRLAmnt);
- Lo = DAG.getNode(ISD::OR, dl, NVT, Lo,
- DAG.getNode(ISD::SHL, dl, NVT, ResultHL, SHLAmnt));
- Hi = DAG.getNode(ISD::SRL, dl, NVT, ResultHL, SRLAmnt);
- Hi = DAG.getNode(ISD::OR, dl, NVT, Hi,
- DAG.getNode(ISD::SHL, dl, NVT, ResultHH, SHLAmnt));
-
// This is similar to the case when we saturate if Scale < NVTSize, but we
- // only need to chech HH.
- if (Saturating) {
- unsigned OverflowBits = VTSize - Scale + 1;
- SDValue HHHiMask = DAG.getConstant(
- APInt::getHighBitsSet(NVTSize, OverflowBits), dl, NVT);
- SDValue HHLoMask = DAG.getConstant(
- APInt::getLowBitsSet(NVTSize, NVTSize - OverflowBits), dl, NVT);
-
- SatMax = DAG.getSetCC(dl, BoolNVT, ResultHH, HHLoMask, ISD::SETGT);
- SatMin = DAG.getSetCC(dl, BoolNVT, ResultHH, HHHiMask, ISD::SETLT);
- }
- } else if (Scale == VTSize) {
- assert(
- !Signed &&
- "Only unsigned types can have a scale equal to the operand bit width");
-
- Lo = ResultHL;
- Hi = ResultHH;
- } else {
- llvm_unreachable("Expected the scale to be less than or equal to the width "
- "of the operands");
- }
+ // only need to check HH.
+ unsigned OverflowBits = VTSize - Scale + 1;
+ SDValue HHHiMask = DAG.getConstant(
+ APInt::getHighBitsSet(NVTSize, OverflowBits), dl, NVT);
+ SDValue HHLoMask = DAG.getConstant(
+ APInt::getLowBitsSet(NVTSize, NVTSize - OverflowBits), dl, NVT);
+ SatMax = DAG.getSetCC(dl, BoolNVT, ResultHH, HHLoMask, ISD::SETGT);
+ SatMin = DAG.getSetCC(dl, BoolNVT, ResultHH, HHHiMask, ISD::SETLT);
+ } else
+ llvm_unreachable("Illegal scale for signed fixed point mul.");
- if (Saturating) {
- APInt LHMax = APInt::getSignedMaxValue(NVTSize);
- APInt LLMax = APInt::getAllOnesValue(NVTSize);
- APInt LHMin = APInt::getSignedMinValue(NVTSize);
- Hi = DAG.getSelect(dl, NVT, SatMax, DAG.getConstant(LHMax, dl, NVT), Hi);
- Hi = DAG.getSelect(dl, NVT, SatMin, DAG.getConstant(LHMin, dl, NVT), Hi);
- Lo = DAG.getSelect(dl, NVT, SatMax, DAG.getConstant(LLMax, dl, NVT), Lo);
- Lo = DAG.getSelect(dl, NVT, SatMin, NVTZero, Lo);
- }
+ // Saturate to signed maximum.
+ APInt MaxHi = APInt::getSignedMaxValue(NVTSize);
+ APInt MaxLo = APInt::getAllOnesValue(NVTSize);
+ Hi = DAG.getSelect(dl, NVT, SatMax, DAG.getConstant(MaxHi, dl, NVT), Hi);
+ Lo = DAG.getSelect(dl, NVT, SatMax, DAG.getConstant(MaxLo, dl, NVT), Lo);
+ // Saturate to signed minimum.
+ APInt MinHi = APInt::getSignedMinValue(NVTSize);
+ Hi = DAG.getSelect(dl, NVT, SatMin, DAG.getConstant(MinHi, dl, NVT), Hi);
+ Lo = DAG.getSelect(dl, NVT, SatMin, NVTZero, Lo);
}
void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node,
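Note on the saturation logic above: with Scale == NVTSize the expanded result is (ResultHL:ResultLH), so the product overflows the signed range exactly when the parts above it are not a plain sign-extension of the result. A minimal standalone sketch of that test on plain integers (satDirection is an illustrative name, not LLVM API):

    // HH and HL model ResultHH/ResultHL for the Scale == NVTSize case.
    #include <cassert>
    #include <cstdint>

    // Returns +1 to saturate at max, -1 to saturate at min, 0 otherwise.
    static int satDirection(int8_t HH, int8_t HL) {
      bool SatMax = HH > 0 || (HH == 0 && HL < 0);    // HL sign bit is 1
      bool SatMin = HH < -1 || (HH == -1 && HL >= 0); // HL sign bit is 0
      return SatMax ? 1 : SatMin ? -1 : 0;
    }

    int main() {
      assert(satDirection(0, -1) == 1);  // too large: clamp to signed max
      assert(satDirection(-1, 1) == -1); // too small: clamp to signed min
      assert(satDirection(0, 1) == 0);   // in range: no saturation
      return 0;
    }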
@@ -3030,7 +3133,9 @@ void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N,
LC = RTLIB::SDIV_I128;
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!");
- SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, true, dl).first, Lo, Hi);
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(true);
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first, Lo, Hi);
}
void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
@@ -3129,7 +3234,9 @@ void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
if (LC != RTLIB::UNKNOWN_LIBCALL && TLI.getLibcallName(LC)) {
SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
- SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, isSigned, dl).first, Lo, Hi);
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(isSigned);
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first, Lo, Hi);
return;
}
@@ -3217,7 +3324,9 @@ void DAGTypeLegalizer::ExpandIntRes_SREM(SDNode *N,
LC = RTLIB::SREM_I128;
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!");
- SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, true, dl).first, Lo, Hi);
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(true);
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first, Lo, Hi);
}
void DAGTypeLegalizer::ExpandIntRes_TRUNCATE(SDNode *N,
@@ -3373,7 +3482,8 @@ void DAGTypeLegalizer::ExpandIntRes_UDIV(SDNode *N,
LC = RTLIB::UDIV_I128;
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UDIV!");
- SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, false, dl).first, Lo, Hi);
+ TargetLowering::MakeLibCallOptions CallOptions;
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first, Lo, Hi);
}
void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N,
@@ -3399,7 +3509,8 @@ void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N,
LC = RTLIB::UREM_I128;
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UREM!");
- SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, false, dl).first, Lo, Hi);
+ TargetLowering::MakeLibCallOptions CallOptions;
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl).first, Lo, Hi);
}
void DAGTypeLegalizer::ExpandIntRes_ZERO_EXTEND(SDNode *N,
@@ -3759,7 +3870,9 @@ SDValue DAGTypeLegalizer::ExpandIntOp_SINT_TO_FP(SDNode *N) {
RTLIB::Libcall LC = RTLIB::getSINTTOFP(Op.getValueType(), DstVT);
assert(LC != RTLIB::UNKNOWN_LIBCALL &&
"Don't know how to expand this SINT_TO_FP!");
- return TLI.makeLibCall(DAG, LC, DstVT, Op, true, SDLoc(N)).first;
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(true);
+ return TLI.makeLibCall(DAG, LC, DstVT, Op, CallOptions, SDLoc(N)).first;
}
SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
@@ -3924,7 +4037,9 @@ SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) {
RTLIB::Libcall LC = RTLIB::getUINTTOFP(SrcVT, DstVT);
assert(LC != RTLIB::UNKNOWN_LIBCALL &&
"Don't know how to expand this UINT_TO_FP!");
- return TLI.makeLibCall(DAG, LC, DstVT, Op, true, dl).first;
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(true);
+ return TLI.makeLibCall(DAG, LC, DstVT, Op, CallOptions, dl).first;
}
SDValue DAGTypeLegalizer::ExpandIntOp_ATOMIC_STORE(SDNode *N) {
@@ -4033,6 +4148,23 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N) {
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NOutVT, Op);
}
+SDValue DAGTypeLegalizer::PromoteIntRes_SPLAT_VECTOR(SDNode *N) {
+ SDLoc dl(N);
+
+ SDValue SplatVal = N->getOperand(0);
+
+ assert(!SplatVal.getValueType().isVector() && "Input must be a scalar");
+
+ EVT OutVT = N->getValueType(0);
+ EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+ assert(NOutVT.isVector() && "Type must be promoted to a vector type");
+ EVT NOutElemVT = NOutVT.getVectorElementType();
+
+ SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutElemVT, SplatVal);
+
+ return DAG.getNode(ISD::SPLAT_VECTOR, dl, NOutVT, Op);
+}
+
SDValue DAGTypeLegalizer::PromoteIntRes_CONCAT_VECTORS(SDNode *N) {
SDLoc dl(N);
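Note on PromoteIntRes_SPLAT_VECTOR above: ANY_EXTEND is sufficient because a splat only replicates the scalar's bits, and promotion leaves the high bits unspecified. A plain-data model of the transformation, assuming an i8 element promoted to i32 (stand-in code, not SelectionDAG):

    #include <array>
    #include <cassert>
    #include <cstdint>

    // Splat an 8-bit value through a vector whose legal element type is
    // 32 bits: extend first, then replicate. High bits are unconstrained.
    static std::array<uint32_t, 4> splatPromoted(uint8_t V) {
      uint32_t Ext = V; // models ISD::ANY_EXTEND of the splat operand
      return {Ext, Ext, Ext, Ext};
    }

    int main() {
      for (uint32_t Lane : splatPromoted(0xAB))
        assert((Lane & 0xFF) == 0xAB); // low 8 bits of every lane preserved
      return 0;
    }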
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 14fd5be23ccb..b596c174a287 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -81,7 +81,6 @@ void DAGTypeLegalizer::PerformExpensiveChecks() {
for (unsigned i = 0, e = Node.getNumValues(); i != e; ++i) {
SDValue Res(&Node, i);
- EVT VT = Res.getValueType();
bool Failed = false;
// Don't create a value in map.
auto ResId = (ValueToIdMap.count(Res)) ? ValueToIdMap[Res] : 0;
@@ -135,17 +134,13 @@ void DAGTypeLegalizer::PerformExpensiveChecks() {
dbgs() << "Unprocessed value in a map!";
Failed = true;
}
- } else if (isTypeLegal(VT) || IgnoreNodeResults(&Node)) {
+ } else if (isTypeLegal(Res.getValueType()) || IgnoreNodeResults(&Node)) {
if (Mapped > 1) {
dbgs() << "Value with legal type was transformed!";
Failed = true;
}
} else {
- // If the value can be kept in HW registers, softening machinery can
- // leave it unchanged and don't put it to any map.
- if (Mapped == 0 &&
- !(getTypeAction(VT) == TargetLowering::TypeSoftenFloat &&
- isLegalInHWReg(VT))) {
+ if (Mapped == 0) {
dbgs() << "Processed value not in any map!";
Failed = true;
} else if (Mapped & (Mapped - 1)) {
@@ -257,13 +252,9 @@ bool DAGTypeLegalizer::run() {
Changed = true;
goto NodeDone;
case TargetLowering::TypeSoftenFloat:
- Changed = SoftenFloatResult(N, i);
- if (Changed)
- goto NodeDone;
- // If not changed, the result type should be legally in register.
- assert(isLegalInHWReg(ResultVT) &&
- "Unchanged SoftenFloatResult should be legal in register!");
- goto ScanOperands;
+ SoftenFloatResult(N, i);
+ Changed = true;
+ goto NodeDone;
case TargetLowering::TypeExpandFloat:
ExpandFloatResult(N, i);
Changed = true;
@@ -439,15 +430,9 @@ NodeDone:
bool Failed = false;
// Check that all result types are legal.
- // A value type is illegal if its TypeAction is not TypeLegal,
- // and TLI.RegClassForVT does not have a register class for this type.
- // For example, the x86_64 target has f128 that is not TypeLegal,
- // to have softened operators, but it also has FR128 register class to
- // pass and return f128 values. Hence a legalized node can have f128 type.
if (!IgnoreNodeResults(&Node))
for (unsigned i = 0, NumVals = Node.getNumValues(); i < NumVals; ++i)
- if (!isTypeLegal(Node.getValueType(i)) &&
- !TLI.isTypeLegal(Node.getValueType(i))) {
+ if (!isTypeLegal(Node.getValueType(i))) {
dbgs() << "Result type " << i << " illegal: ";
Node.dump(&DAG);
Failed = true;
@@ -456,8 +441,7 @@ NodeDone:
// Check that all operand types are legal.
for (unsigned i = 0, NumOps = Node.getNumOperands(); i < NumOps; ++i)
if (!IgnoreNodeResults(Node.getOperand(i).getNode()) &&
- !isTypeLegal(Node.getOperand(i).getValueType()) &&
- !TLI.isTypeLegal(Node.getOperand(i).getValueType())) {
+ !isTypeLegal(Node.getOperand(i).getValueType())) {
dbgs() << "Operand type " << i << " illegal: ";
Node.getOperand(i).dump(&DAG);
Failed = true;
@@ -713,23 +697,13 @@ void DAGTypeLegalizer::SetPromotedInteger(SDValue Op, SDValue Result) {
}
void DAGTypeLegalizer::SetSoftenedFloat(SDValue Op, SDValue Result) {
- // f128 of x86_64 could be kept in SSE registers,
- // but sometimes softened to i128.
- assert((Result.getValueType() ==
- TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) ||
- Op.getValueType() ==
- TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType())) &&
+ assert(Result.getValueType() ==
+ TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
"Invalid type for softened float");
AnalyzeNewValue(Result);
auto &OpIdEntry = SoftenedFloats[getTableId(Op)];
- // Allow repeated calls to save f128 type nodes
- // or any node with type that transforms to itself.
- // Many operations on these types are not softened.
- assert(((OpIdEntry == 0) ||
- Op.getValueType() ==
- TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType())) &&
- "Node is already converted to integer!");
+ assert((OpIdEntry == 0) && "Node is already converted to integer!");
OpIdEntry = getTableId(Result);
}
@@ -1003,25 +977,27 @@ SDValue DAGTypeLegalizer::JoinIntegers(SDValue Lo, SDValue Hi) {
/// Convert the node into a libcall with the same prototype.
SDValue DAGTypeLegalizer::LibCallify(RTLIB::Libcall LC, SDNode *N,
bool isSigned) {
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(isSigned);
unsigned NumOps = N->getNumOperands();
SDLoc dl(N);
if (NumOps == 0) {
- return TLI.makeLibCall(DAG, LC, N->getValueType(0), None, isSigned,
+ return TLI.makeLibCall(DAG, LC, N->getValueType(0), None, CallOptions,
dl).first;
} else if (NumOps == 1) {
SDValue Op = N->getOperand(0);
- return TLI.makeLibCall(DAG, LC, N->getValueType(0), Op, isSigned,
+ return TLI.makeLibCall(DAG, LC, N->getValueType(0), Op, CallOptions,
dl).first;
} else if (NumOps == 2) {
SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
- return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, isSigned,
+ return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, CallOptions,
dl).first;
}
SmallVector<SDValue, 8> Ops(NumOps);
for (unsigned i = 0; i < NumOps; ++i)
Ops[i] = N->getOperand(i);
- return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, isSigned, dl).first;
+ return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, CallOptions, dl).first;
}
/// Expand a node into a call to a libcall. Similar to ExpandLibCall except that
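Note on the makeLibCall changes throughout this patch: the boolean isSigned parameter is replaced by a TargetLowering::MakeLibCallOptions builder, so call sites name the option they set and new options can be added without touching every caller. A reduced sketch of the pattern (MakeLibCallOptions below is a stand-in, not the full TargetLowering definition):

    // Chainable option struct replacing a bare bool parameter.
    struct MakeLibCallOptions {
      bool IsSExt = false;
      MakeLibCallOptions &setSExt(bool V) {
        IsSExt = V;
        return *this;
      }
    };

    static int makeLibCall(const MakeLibCallOptions &Opts) {
      return Opts.IsSExt ? 1 : 0; // stand-in for the real lowering
    }

    int main() {
      MakeLibCallOptions CallOptions;
      CallOptions.setSExt(true); // was: makeLibCall(..., /*isSigned=*/true, ...)
      return makeLibCall(CallOptions) == 1 ? 0 : 1;
    }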
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index 1d489b1b3a33..4afbae69128a 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -73,15 +73,6 @@ private:
return VT.isSimple() && TLI.isTypeLegal(VT);
}
- /// Return true if this type can be passed in registers.
- /// For example, x86_64's f128, should to be legally in registers
- /// and only some operations converted to library calls or integer
- /// bitwise operations.
- bool isLegalInHWReg(EVT VT) const {
- EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
- return VT == NVT && isSimpleLegalType(VT);
- }
-
EVT getSetCCResultType(EVT VT) const {
return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
}
@@ -306,6 +297,7 @@ private:
SDValue PromoteIntRes_VECTOR_SHUFFLE(SDNode *N);
SDValue PromoteIntRes_BUILD_VECTOR(SDNode *N);
SDValue PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N);
+ SDValue PromoteIntRes_SPLAT_VECTOR(SDNode *N);
SDValue PromoteIntRes_EXTEND_VECTOR_INREG(SDNode *N);
SDValue PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N);
SDValue PromoteIntRes_CONCAT_VECTORS(SDNode *N);
@@ -363,6 +355,7 @@ private:
SDValue PromoteIntOp_EXTRACT_SUBVECTOR(SDNode *N);
SDValue PromoteIntOp_CONCAT_VECTORS(SDNode *N);
SDValue PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N);
+ SDValue PromoteIntOp_SPLAT_VECTOR(SDNode *N);
SDValue PromoteIntOp_SELECT(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo);
SDValue PromoteIntOp_SETCC(SDNode *N, unsigned OpNo);
@@ -472,14 +465,11 @@ private:
// Float to Integer Conversion Support: LegalizeFloatTypes.cpp
//===--------------------------------------------------------------------===//
- /// Given an operand Op of Float type, returns the integer if the Op is not
- /// supported in target HW and converted to the integer.
- /// The integer contains exactly the same bits as Op - only the type changed.
- /// For example, if Op is an f32 which was softened to an i32, then this
- /// method returns an i32, the bits of which coincide with those of Op.
- /// If the Op can be efficiently supported in target HW or the operand must
- /// stay in a register, the Op is not converted to an integer.
- /// In that case, the given op is returned.
+ /// GetSoftenedFloat - Given a processed operand Op which was converted to an
+ /// integer of the same size, this returns the integer. The integer contains
+ /// exactly the same bits as Op - only the type changed. For example, if Op
+ /// is an f32 which was softened to an i32, then this method returns an i32,
+ /// the bits of which coincide with those of Op.
SDValue GetSoftenedFloat(SDValue Op) {
TableId Id = getTableId(Op);
auto Iter = SoftenedFloats.find(Id);
@@ -494,19 +484,19 @@ private:
}
void SetSoftenedFloat(SDValue Op, SDValue Result);
- // Convert Float Results to Integer for Non-HW-supported Operations.
- bool SoftenFloatResult(SDNode *N, unsigned ResNo);
+ // Convert Float Results to Integer.
+ void SoftenFloatResult(SDNode *N, unsigned ResNo);
SDValue SoftenFloatRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
- SDValue SoftenFloatRes_BITCAST(SDNode *N, unsigned ResNo);
+ SDValue SoftenFloatRes_BITCAST(SDNode *N);
SDValue SoftenFloatRes_BUILD_PAIR(SDNode *N);
- SDValue SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo);
+ SDValue SoftenFloatRes_ConstantFP(SDNode *N);
SDValue SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N, unsigned ResNo);
- SDValue SoftenFloatRes_FABS(SDNode *N, unsigned ResNo);
+ SDValue SoftenFloatRes_FABS(SDNode *N);
SDValue SoftenFloatRes_FMINNUM(SDNode *N);
SDValue SoftenFloatRes_FMAXNUM(SDNode *N);
SDValue SoftenFloatRes_FADD(SDNode *N);
SDValue SoftenFloatRes_FCEIL(SDNode *N);
- SDValue SoftenFloatRes_FCOPYSIGN(SDNode *N, unsigned ResNo);
+ SDValue SoftenFloatRes_FCOPYSIGN(SDNode *N);
SDValue SoftenFloatRes_FCOS(SDNode *N);
SDValue SoftenFloatRes_FDIV(SDNode *N);
SDValue SoftenFloatRes_FEXP(SDNode *N);
@@ -518,7 +508,7 @@ private:
SDValue SoftenFloatRes_FMA(SDNode *N);
SDValue SoftenFloatRes_FMUL(SDNode *N);
SDValue SoftenFloatRes_FNEARBYINT(SDNode *N);
- SDValue SoftenFloatRes_FNEG(SDNode *N, unsigned ResNo);
+ SDValue SoftenFloatRes_FNEG(SDNode *N);
SDValue SoftenFloatRes_FP_EXTEND(SDNode *N);
SDValue SoftenFloatRes_FP16_TO_FP(SDNode *N);
SDValue SoftenFloatRes_FP_ROUND(SDNode *N);
@@ -531,27 +521,17 @@ private:
SDValue SoftenFloatRes_FSQRT(SDNode *N);
SDValue SoftenFloatRes_FSUB(SDNode *N);
SDValue SoftenFloatRes_FTRUNC(SDNode *N);
- SDValue SoftenFloatRes_LOAD(SDNode *N, unsigned ResNo);
- SDValue SoftenFloatRes_SELECT(SDNode *N, unsigned ResNo);
- SDValue SoftenFloatRes_SELECT_CC(SDNode *N, unsigned ResNo);
+ SDValue SoftenFloatRes_LOAD(SDNode *N);
+ SDValue SoftenFloatRes_SELECT(SDNode *N);
+ SDValue SoftenFloatRes_SELECT_CC(SDNode *N);
SDValue SoftenFloatRes_UNDEF(SDNode *N);
SDValue SoftenFloatRes_VAARG(SDNode *N);
SDValue SoftenFloatRes_XINT_TO_FP(SDNode *N);
- // Return true if we can skip softening the given operand or SDNode because
- // either it was soften before by SoftenFloatResult and references to the
- // operand were replaced by ReplaceValueWith or it's value type is legal in HW
- // registers and the operand can be left unchanged.
- bool CanSkipSoftenFloatOperand(SDNode *N, unsigned OpNo);
-
- // Convert Float Operand to Integer for Non-HW-supported Operations.
+ // Convert Float Operand to Integer.
bool SoftenFloatOperand(SDNode *N, unsigned OpNo);
SDValue SoftenFloatOp_BITCAST(SDNode *N);
- SDValue SoftenFloatOp_COPY_TO_REG(SDNode *N);
SDValue SoftenFloatOp_BR_CC(SDNode *N);
- SDValue SoftenFloatOp_FABS(SDNode *N);
- SDValue SoftenFloatOp_FCOPYSIGN(SDNode *N);
- SDValue SoftenFloatOp_FNEG(SDNode *N);
SDValue SoftenFloatOp_FP_EXTEND(SDNode *N);
SDValue SoftenFloatOp_FP_ROUND(SDNode *N);
SDValue SoftenFloatOp_FP_TO_XINT(SDNode *N);
@@ -559,7 +539,6 @@ private:
SDValue SoftenFloatOp_LLROUND(SDNode *N);
SDValue SoftenFloatOp_LRINT(SDNode *N);
SDValue SoftenFloatOp_LLRINT(SDNode *N);
- SDValue SoftenFloatOp_SELECT(SDNode *N);
SDValue SoftenFloatOp_SELECT_CC(SDNode *N);
SDValue SoftenFloatOp_SETCC(SDNode *N);
SDValue SoftenFloatOp_STORE(SDNode *N, unsigned OpNo);
@@ -715,6 +694,7 @@ private:
bool ScalarizeVectorOperand(SDNode *N, unsigned OpNo);
SDValue ScalarizeVecOp_BITCAST(SDNode *N);
SDValue ScalarizeVecOp_UnaryOp(SDNode *N);
+ SDValue ScalarizeVecOp_UnaryOp_StrictFP(SDNode *N);
SDValue ScalarizeVecOp_CONCAT_VECTORS(SDNode *N);
SDValue ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
SDValue ScalarizeVecOp_VSELECT(SDNode *N);
@@ -830,6 +810,7 @@ private:
SDValue WidenVecRes_Ternary(SDNode *N);
SDValue WidenVecRes_Binary(SDNode *N);
SDValue WidenVecRes_BinaryCanTrap(SDNode *N);
+ SDValue WidenVecRes_BinaryWithExtraScalarOp(SDNode *N);
SDValue WidenVecRes_StrictFP(SDNode *N);
SDValue WidenVecRes_OverflowOp(SDNode *N, unsigned ResNo);
SDValue WidenVecRes_Convert(SDNode *N);
@@ -933,6 +914,8 @@ private:
void SplitRes_SELECT_CC (SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitRes_UNDEF (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVSETCC(const SDNode *N);
+
//===--------------------------------------------------------------------===//
// Generic Expansion: LegalizeTypesGeneric.cpp
//===--------------------------------------------------------------------===//
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 943f63f46c47..5562f400b6e1 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -52,17 +52,11 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
case TargetLowering::TypePromoteFloat:
llvm_unreachable("Bitcast of a promotion-needing float should never need"
"expansion");
- case TargetLowering::TypeSoftenFloat: {
- // Expand the floating point operand only if it was converted to integers.
- // Otherwise, it is a legal type like f128 that can be saved in a register.
- auto SoftenedOp = GetSoftenedFloat(InOp);
- if (isLegalInHWReg(SoftenedOp.getValueType()))
- break;
- SplitInteger(SoftenedOp, Lo, Hi);
+ case TargetLowering::TypeSoftenFloat:
+ SplitInteger(GetSoftenedFloat(InOp), Lo, Hi);
Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
return;
- }
case TargetLowering::TypeExpandInteger:
case TargetLowering::TypeExpandFloat: {
auto &DL = DAG.getDataLayout();
@@ -509,23 +503,6 @@ void DAGTypeLegalizer::SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
GetSplitOp(Op, Lo, Hi);
}
-static std::pair<SDValue, SDValue> SplitVSETCC(const SDNode *N,
- SelectionDAG &DAG) {
- SDLoc DL(N);
- EVT LoVT, HiVT;
- std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
-
- // Split the inputs.
- SDValue Lo, Hi, LL, LH, RL, RH;
- std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0);
- std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1);
-
- Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2));
- Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2));
-
- return std::make_pair(Lo, Hi);
-}
-
void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, SDValue &Hi) {
SDValue LL, LH, RL, RH, CL, CH;
SDLoc dl(N);
@@ -537,16 +514,25 @@ void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, SDValue &Hi) {
if (Cond.getValueType().isVector()) {
if (SDValue Res = WidenVSELECTAndMask(N))
std::tie(CL, CH) = DAG.SplitVector(Res->getOperand(0), dl);
- // It seems to improve code to generate two narrow SETCCs as opposed to
- // splitting a wide result vector.
- else if (Cond.getOpcode() == ISD::SETCC)
- std::tie(CL, CH) = SplitVSETCC(Cond.getNode(), DAG);
    // Check if there are already split versions of the vector available and
// use those instead of splitting the mask operand again.
else if (getTypeAction(Cond.getValueType()) ==
TargetLowering::TypeSplitVector)
GetSplitVector(Cond, CL, CH);
- else
+ // It seems to improve code to generate two narrow SETCCs as opposed to
+ // splitting a wide result vector.
+ else if (Cond.getOpcode() == ISD::SETCC) {
+ // If the condition is a vXi1 vector, and the LHS of the setcc is a legal
+ // type and the setcc result type is the same vXi1, then leave the setcc
+ // alone.
+ EVT CondLHSVT = Cond.getOperand(0).getValueType();
+ if (Cond.getValueType().getVectorElementType() == MVT::i1 &&
+ isTypeLegal(CondLHSVT) &&
+ getSetCCResultType(CondLHSVT) == Cond.getValueType())
+ std::tie(CL, CH) = DAG.SplitVector(Cond, dl);
+ else
+ SplitVecRes_SETCC(Cond.getNode(), CL, CH);
+ } else
std::tie(CL, CH) = DAG.SplitVector(Cond, dl);
}
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 10b8b705869e..15c3a0b6cfad 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -38,6 +38,7 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
@@ -333,14 +334,27 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::STRICT_FFLOOR:
case ISD::STRICT_FROUND:
case ISD::STRICT_FTRUNC:
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT:
case ISD::STRICT_FP_ROUND:
case ISD::STRICT_FP_EXTEND:
- // These pseudo-ops get legalized as if they were their non-strict
- // equivalent. For instance, if ISD::FSQRT is legal then ISD::STRICT_FSQRT
- // is also legal, but if ISD::FSQRT requires expansion then so does
- // ISD::STRICT_FSQRT.
- Action = TLI.getStrictFPOperationAction(Node->getOpcode(),
- Node->getValueType(0));
+ Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+ // If we're asked to expand a strict vector floating-point operation,
+ // by default we're going to simply unroll it. That is usually the
+ // best approach, except in the case where the resulting strict (scalar)
+ // operations would themselves use the fallback mutation to non-strict.
+ // In that specific case, just do the fallback on the vector op.
+ if (Action == TargetLowering::Expand &&
+ TLI.getStrictFPOperationAction(Node->getOpcode(),
+ Node->getValueType(0))
+ == TargetLowering::Legal) {
+ EVT EltVT = Node->getValueType(0).getVectorElementType();
+ if (TLI.getOperationAction(Node->getOpcode(), EltVT)
+ == TargetLowering::Expand &&
+ TLI.getStrictFPOperationAction(Node->getOpcode(), EltVT)
+ == TargetLowering::Legal)
+ Action = TargetLowering::Legal;
+ }
break;
case ISD::ADD:
case ISD::SUB:
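Note on the action selection above: the new code keeps a strict vector op legal when expanding (unrolling) it would only produce scalar ops that fall back to non-strict anyway. A reduced model of that decision (stand-in enum and function, not the TargetLowering API):

    enum Action { Legal, Expand };

    // VecNonStrict/VecStrict: actions for the vector op and its strict form;
    // EltNonStrict/EltStrict: the same queries on the element type.
    static Action strictVectorFPAction(Action VecNonStrict, Action VecStrict,
                                       Action EltNonStrict, Action EltStrict) {
      Action A = VecNonStrict;
      // Expanding a strict vector op unrolls it; if the scalar ops would
      // themselves mutate to non-strict while the strict vector form is
      // directly legal, keep the vector op legal instead.
      if (A == Expand && VecStrict == Legal && EltNonStrict == Expand &&
          EltStrict == Legal)
        A = Legal;
      return A;
    }

    int main() {
      return strictVectorFPAction(Expand, Legal, Expand, Legal) == Legal ? 0 : 1;
    }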
@@ -439,16 +453,13 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
break;
case ISD::SMULFIX:
case ISD::SMULFIXSAT:
- case ISD::UMULFIX: {
+ case ISD::UMULFIX:
+ case ISD::UMULFIXSAT: {
unsigned Scale = Node->getConstantOperandVal(2);
Action = TLI.getFixedPointOperationAction(Node->getOpcode(),
Node->getValueType(0), Scale);
break;
}
- case ISD::FP_ROUND_INREG:
- Action = TLI.getOperationAction(Node->getOpcode(),
- cast<VTSDNode>(Node->getOperand(1))->getVT());
- break;
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
case ISD::VECREDUCE_ADD:
@@ -820,6 +831,13 @@ SDValue VectorLegalizer::Expand(SDValue Op) {
case ISD::SMULFIX:
case ISD::UMULFIX:
return ExpandFixedPointMul(Op);
+ case ISD::SMULFIXSAT:
+ case ISD::UMULFIXSAT:
+ // FIXME: We do not expand SMULFIXSAT/UMULFIXSAT here yet; it is not clear
+ // exactly why. Perhaps it results in worse codegen than the unroll for some
+ // targets? This should be investigated, and if we keep preferring the
+ // unroll, the reason deserves a comment here.
+ return DAG.UnrollVectorOp(Op.getNode());
case ISD::STRICT_FADD:
case ISD::STRICT_FSUB:
case ISD::STRICT_FMUL:
@@ -844,6 +862,8 @@ SDValue VectorLegalizer::Expand(SDValue Op) {
case ISD::STRICT_FFLOOR:
case ISD::STRICT_FROUND:
case ISD::STRICT_FTRUNC:
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT:
return ExpandStrictFPOp(Op);
case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_MUL:
@@ -1168,9 +1188,13 @@ SDValue VectorLegalizer::ExpandABS(SDValue Op) {
SDValue VectorLegalizer::ExpandFP_TO_UINT(SDValue Op) {
// Attempt to expand using TargetLowering.
- SDValue Result;
- if (TLI.expandFP_TO_UINT(Op.getNode(), Result, DAG))
+ SDValue Result, Chain;
+ if (TLI.expandFP_TO_UINT(Op.getNode(), Result, Chain, DAG)) {
+ if (Op.getNode()->isStrictFPOpcode())
+ // Relink the chain
+ DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Chain);
return Result;
+ }
// Otherwise go ahead and unroll.
return DAG.UnrollVectorOp(Op.getNode());
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 7e4d52617977..3763e886cef2 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -52,7 +52,6 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::EXTRACT_SUBVECTOR: R = ScalarizeVecRes_EXTRACT_SUBVECTOR(N); break;
case ISD::STRICT_FP_ROUND: R = ScalarizeVecRes_STRICT_FP_ROUND(N); break;
case ISD::FP_ROUND: R = ScalarizeVecRes_FP_ROUND(N); break;
- case ISD::FP_ROUND_INREG: R = ScalarizeVecRes_InregOp(N); break;
case ISD::FPOWI: R = ScalarizeVecRes_FPOWI(N); break;
case ISD::INSERT_VECTOR_ELT: R = ScalarizeVecRes_INSERT_VECTOR_ELT(N); break;
case ISD::LOAD: R = ScalarizeVecRes_LOAD(cast<LoadSDNode>(N));break;
@@ -171,6 +170,8 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::STRICT_FFLOOR:
case ISD::STRICT_FROUND:
case ISD::STRICT_FTRUNC:
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT:
case ISD::STRICT_FP_EXTEND:
R = ScalarizeVecRes_StrictFPOp(N);
break;
@@ -185,6 +186,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::SMULFIX:
case ISD::SMULFIXSAT:
case ISD::UMULFIX:
+ case ISD::UMULFIXSAT:
R = ScalarizeVecRes_MULFIX(N);
break;
}
@@ -604,6 +606,10 @@ bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::UINT_TO_FP:
Res = ScalarizeVecOp_UnaryOp(N);
break;
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT:
+ Res = ScalarizeVecOp_UnaryOp_StrictFP(N);
+ break;
case ISD::CONCAT_VECTORS:
Res = ScalarizeVecOp_CONCAT_VECTORS(N);
break;
@@ -679,6 +685,23 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) {
return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Op);
}
+/// If the input is a vector that needs to be scalarized, it must be <1 x ty>.
+/// Do the strict FP operation on the element instead.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp_StrictFP(SDNode *N) {
+ assert(N->getValueType(0).getVectorNumElements() == 1 &&
+ "Unexpected vector type!");
+ SDValue Elt = GetScalarizedVector(N->getOperand(1));
+ SDValue Res = DAG.getNode(N->getOpcode(), SDLoc(N),
+ { N->getValueType(0).getScalarType(), MVT::Other },
+ { N->getOperand(0), Elt });
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ // Revectorize the result so the types line up with what the uses of this
+ // expression expect.
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Res);
+}
+
/// The vectors to concatenate have length one - use a BUILD_VECTOR instead.
SDValue DAGTypeLegalizer::ScalarizeVecOp_CONCAT_VECTORS(SDNode *N) {
SmallVector<SDValue, 8> Ops(N->getNumOperands());
@@ -828,7 +851,6 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::CONCAT_VECTORS: SplitVecRes_CONCAT_VECTORS(N, Lo, Hi); break;
case ISD::EXTRACT_SUBVECTOR: SplitVecRes_EXTRACT_SUBVECTOR(N, Lo, Hi); break;
case ISD::INSERT_SUBVECTOR: SplitVecRes_INSERT_SUBVECTOR(N, Lo, Hi); break;
- case ISD::FP_ROUND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
case ISD::FPOWI: SplitVecRes_FPOWI(N, Lo, Hi); break;
case ISD::FCOPYSIGN: SplitVecRes_FCOPYSIGN(N, Lo, Hi); break;
case ISD::INSERT_VECTOR_ELT: SplitVecRes_INSERT_VECTOR_ELT(N, Lo, Hi); break;
@@ -883,7 +905,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::FP_ROUND:
case ISD::STRICT_FP_ROUND:
case ISD::FP_TO_SINT:
+ case ISD::STRICT_FP_TO_SINT:
case ISD::FP_TO_UINT:
+ case ISD::STRICT_FP_TO_UINT:
case ISD::FRINT:
case ISD::FROUND:
case ISD::FSIN:
@@ -977,6 +1001,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::SMULFIX:
case ISD::SMULFIXSAT:
case ISD::UMULFIX:
+ case ISD::UMULFIXSAT:
SplitVecRes_MULFIX(N, Lo, Hi);
break;
}
@@ -1560,10 +1585,14 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
// Split Mask operand
SDValue MaskLo, MaskHi;
- if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
- GetSplitVector(Mask, MaskLo, MaskHi);
- else
- std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
+ if (Mask.getOpcode() == ISD::SETCC) {
+ SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
+ } else {
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
+ }
EVT MemoryVT = MLD->getMemoryVT();
EVT LoMemVT, HiMemVT;
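Note on the mask handling above (the same pattern recurs below for MGATHER, MSTORE and MSCATTER): when the mask is produced directly by a SETCC, re-emitting two half-width compares avoids materializing and then splitting a wide i1 vector. A plain-data model of that split:

    #include <array>
    #include <cassert>

    int main() {
      std::array<int, 4> A = {1, 5, 3, 7}, B = {2, 2, 2, 2};
      std::array<bool, 2> MaskLo, MaskHi;
      for (int i = 0; i != 2; ++i)
        MaskLo[i] = A[i] > B[i]; // low-half compare
      for (int i = 0; i != 2; ++i)
        MaskHi[i] = A[i + 2] > B[i + 2]; // high-half compare
      assert(!MaskLo[0] && MaskLo[1] && MaskHi[0] && MaskHi[1]);
      return 0;
    }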
@@ -1622,10 +1651,14 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT,
// Split Mask operand
SDValue MaskLo, MaskHi;
- if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
- GetSplitVector(Mask, MaskLo, MaskHi);
- else
- std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
+ if (Mask.getOpcode() == ISD::SETCC) {
+ SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
+ } else {
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
+ }
EVT MemoryVT = MGT->getMemoryVT();
EVT LoMemVT, HiMemVT;
@@ -1651,11 +1684,11 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT,
SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale};
Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl, OpsLo,
- MMO);
+ MMO, MGT->getIndexType());
SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale};
Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl, OpsHi,
- MMO);
+ MMO, MGT->getIndexType());
// Build a factor node to remember that this load is independent of the
// other one.
@@ -1979,6 +2012,8 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
break;
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT:
case ISD::CTTZ:
case ISD::CTLZ:
case ISD::CTPOP:
@@ -2293,7 +2328,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT,
SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale};
SDValue Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl,
- OpsLo, MMO);
+ OpsLo, MMO, MGT->getIndexType());
MMO = DAG.getMachineFunction().
getMachineMemOperand(MGT->getPointerInfo(),
@@ -2303,7 +2338,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT,
SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale};
SDValue Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl,
- OpsHi, MMO);
+ OpsHi, MMO, MGT->getIndexType());
// Build a factor node to remember that this load is independent of the
// other one.
@@ -2340,12 +2375,16 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
else
std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
+ // Split Mask operand
SDValue MaskLo, MaskHi;
- if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
- // Split Mask operand
- GetSplitVector(Mask, MaskLo, MaskHi);
- else
- std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
+ if (OpNo == 1 && Mask.getOpcode() == ISD::SETCC) {
+ SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
+ } else {
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
+ }
SDValue Lo, Hi;
MachineMemOperand *MMO = DAG.getMachineFunction().
@@ -2397,12 +2436,16 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N,
else
std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
+ // Split Mask operand
SDValue MaskLo, MaskHi;
- if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
- // Split Mask operand
- GetSplitVector(Mask, MaskLo, MaskHi);
- else
- std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
+ if (OpNo == 1 && Mask.getOpcode() == ISD::SETCC) {
+ SplitVecRes_SETCC(Mask.getNode(), MaskLo, MaskHi);
+ } else {
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
+ }
SDValue IndexHi, IndexLo;
if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
@@ -2418,7 +2461,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N,
SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo, Scale};
Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(),
- DL, OpsLo, MMO);
+ DL, OpsLo, MMO, N->getIndexType());
MMO = DAG.getMachineFunction().
getMachineMemOperand(N->getPointerInfo(),
@@ -2430,7 +2473,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N,
// after another.
SDValue OpsHi[] = {Lo, DataHi, MaskHi, Ptr, IndexHi, Scale};
return DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(),
- DL, OpsHi, MMO);
+ DL, OpsHi, MMO, N->getIndexType());
}
SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
@@ -2596,7 +2639,11 @@ SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) {
LoRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Lo0, Lo1, N->getOperand(2));
HiRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Hi0, Hi1, N->getOperand(2));
SDValue Con = DAG.getNode(ISD::CONCAT_VECTORS, DL, WideResVT, LoRes, HiRes);
- return PromoteTargetBoolean(Con, N->getValueType(0));
+
+ EVT OpVT = N->getOperand(0).getValueType();
+ ISD::NodeType ExtendCode =
+ TargetLowering::getExtendForContent(TLI.getBooleanContents(OpVT));
+ return DAG.getNode(ExtendCode, DL, N->getValueType(0), Con);
}
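Note on the change above: PromoteTargetBoolean is replaced by querying the target's boolean contents for the operand type directly; the extension opcode depends on how the target encodes a true lane. A reduced model (stand-in enum, not the TargetLowering API):

    enum BooleanContent { ZeroOrOne, ZeroOrNegativeOne };

    // Models getExtendForContent: 0/1 booleans zero-extend, 0/-1 booleans
    // sign-extend so that a true lane widens to all-ones.
    static int extendBool(bool B, BooleanContent C) {
      return C == ZeroOrOne ? (B ? 1 : 0) : (B ? -1 : 0);
    }

    int main() {
      return (extendBool(true, ZeroOrNegativeOne) == -1 &&
              extendBool(true, ZeroOrOne) == 1)
                 ? 0
                 : 1;
    }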
@@ -2663,7 +2710,6 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::BUILD_VECTOR: Res = WidenVecRes_BUILD_VECTOR(N); break;
case ISD::CONCAT_VECTORS: Res = WidenVecRes_CONCAT_VECTORS(N); break;
case ISD::EXTRACT_SUBVECTOR: Res = WidenVecRes_EXTRACT_SUBVECTOR(N); break;
- case ISD::FP_ROUND_INREG: Res = WidenVecRes_InregOp(N); break;
case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break;
case ISD::LOAD: Res = WidenVecRes_LOAD(N); break;
case ISD::SCALAR_TO_VECTOR: Res = WidenVecRes_SCALAR_TO_VECTOR(N); break;
@@ -2719,6 +2765,15 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
Res = WidenVecRes_BinaryCanTrap(N);
break;
+ case ISD::SMULFIX:
+ case ISD::SMULFIXSAT:
+ case ISD::UMULFIX:
+ case ISD::UMULFIXSAT:
+ // These are binary operations, but with an extra operand that shouldn't
+ // be widened (the scale).
+ Res = WidenVecRes_BinaryWithExtraScalarOp(N);
+ break;
+
case ISD::STRICT_FADD:
case ISD::STRICT_FSUB:
case ISD::STRICT_FMUL:
@@ -2790,6 +2845,8 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::STRICT_FP_EXTEND:
case ISD::STRICT_FP_ROUND:
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT:
Res = WidenVecRes_Convert_StrictFP(N);
break;
@@ -2866,6 +2923,17 @@ SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) {
return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, N->getFlags());
}
+SDValue DAGTypeLegalizer::WidenVecRes_BinaryWithExtraScalarOp(SDNode *N) {
+ // Binary op widening, but with an extra operand that shouldn't be widened.
+ SDLoc dl(N);
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+ SDValue InOp3 = N->getOperand(2);
+ return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, InOp3,
+ N->getFlags());
+}
+
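Note on the SMULFIX family handled here: an N-bit fixed-point multiply computes the 2N-bit product and drops Scale fractional bits, which is why the scale operand is an immediate and is never widened along with the vector inputs. A plain-integer model of the scalar semantics (not the DAG expansion):

    #include <cassert>
    #include <cstdint>

    static int32_t smulfix32(int32_t A, int32_t B, unsigned Scale) {
      return (int32_t)(((int64_t)A * B) >> Scale);
    }

    int main() {
      // In Q16.16 fixed point (Scale == 16): 1.5 * 2.0 == 3.0.
      assert(smulfix32(3 << 15, 2 << 16, 16) == 3 << 16);
      return 0;
    }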
// Given a vector of operations that have been broken up to widen, see
// if we can collect them together into the next widest legal VT. This
// implementation is trap-safe.
@@ -3716,7 +3784,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_MGATHER(MaskedGatherSDNode *N) {
Scale };
SDValue Res = DAG.getMaskedGather(DAG.getVTList(WideVT, MVT::Other),
N->getMemoryVT(), dl, Ops,
- N->getMemOperand());
+ N->getMemOperand(), N->getIndexType());
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
@@ -4094,7 +4162,9 @@ bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
case ISD::FP_EXTEND:
case ISD::STRICT_FP_EXTEND:
case ISD::FP_TO_SINT:
+ case ISD::STRICT_FP_TO_SINT:
case ISD::FP_TO_UINT:
+ case ISD::STRICT_FP_TO_UINT:
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
case ISD::TRUNCATE:
@@ -4434,7 +4504,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_MGATHER(SDNode *N, unsigned OpNo) {
SDValue Ops[] = {MG->getChain(), DataOp, Mask, MG->getBasePtr(), Index,
Scale};
SDValue Res = DAG.getMaskedGather(MG->getVTList(), MG->getMemoryVT(), dl, Ops,
- MG->getMemOperand());
+ MG->getMemOperand(), MG->getIndexType());
ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
ReplaceValueWith(SDValue(N, 0), Res.getValue(0));
return SDValue();
@@ -4472,7 +4542,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) {
Scale};
return DAG.getMaskedScatter(DAG.getVTList(MVT::Other),
MSC->getMemoryVT(), SDLoc(N), Ops,
- MSC->getMemOperand());
+ MSC->getMemOperand(), MSC->getIndexType());
}
SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
@@ -4504,7 +4574,10 @@ SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
ISD::EXTRACT_SUBVECTOR, dl, ResVT, WideSETCC,
DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
- return PromoteTargetBoolean(CC, VT);
+ EVT OpVT = N->getOperand(0).getValueType();
+ ISD::NodeType ExtendCode =
+ TargetLowering::getExtendForContent(TLI.getBooleanContents(OpVT));
+ return DAG.getNode(ExtendCode, dl, VT, CC);
}
SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) {
@@ -4706,7 +4779,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
int LdWidth = LdVT.getSizeInBits();
int WidthDiff = WidenWidth - LdWidth;
- unsigned LdAlign = LD->isVolatile() ? 0 : Align; // Allow wider loads.
+ unsigned LdAlign = (!LD->isSimple()) ? 0 : Align; // Allow wider loads.
// Find the vector type that can load from.
EVT NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
index 2cb850fa1a3d..7ee44c808fcb 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
@@ -498,7 +498,7 @@ bool ScheduleDAGFast::DelayForLiveRegsBottomUp(SUnit *SU,
// Check for def of register or earlyclobber register.
for (; NumVals; --NumVals, ++i) {
unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
CheckForLiveRegDef(SU, Reg, LiveRegDefs, RegAdded, LRegs, TRI);
}
} else
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index 34b4c8502353..ff806bdb822c 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -1188,6 +1188,10 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
if (!Pred.isArtificial())
AddPredQueued(NewSU, Pred);
+ // Make sure the clone comes after the original. (InstrEmitter assumes
+ // this ordering.)
+ AddPredQueued(NewSU, SDep(SU, SDep::Artificial));
+
// Only copy scheduled successors. Cut them from old node's successor
// list and move them over.
SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
@@ -1374,7 +1378,7 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) {
// Check for def of register or earlyclobber register.
for (; NumVals; --NumVals, ++i) {
unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
CheckForLiveRegDef(SU, Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI);
}
} else
@@ -2358,7 +2362,7 @@ static bool hasOnlyLiveInOpers(const SUnit *SU) {
PredSU->getNode()->getOpcode() == ISD::CopyFromReg) {
unsigned Reg =
cast<RegisterSDNode>(PredSU->getNode()->getOperand(1))->getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
RetVal = true;
continue;
}
@@ -2379,7 +2383,7 @@ static bool hasOnlyLiveOutUses(const SUnit *SU) {
if (SuccSU->getNode() && SuccSU->getNode()->getOpcode() == ISD::CopyToReg) {
unsigned Reg =
cast<RegisterSDNode>(SuccSU->getNode()->getOperand(1))->getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
RetVal = true;
continue;
}
@@ -2948,8 +2952,8 @@ void RegReductionPQBase::PrescheduleNodesWithMultipleUses() {
// like other nodes from the perspective of scheduling heuristics.
if (SDNode *N = SU.getNode())
if (N->getOpcode() == ISD::CopyToReg &&
- TargetRegisterInfo::isVirtualRegister
- (cast<RegisterSDNode>(N->getOperand(1))->getReg()))
+ Register::isVirtualRegister(
+ cast<RegisterSDNode>(N->getOperand(1))->getReg()))
continue;
SDNode *PredFrameSetup = nullptr;
@@ -2995,8 +2999,8 @@ void RegReductionPQBase::PrescheduleNodesWithMultipleUses() {
// like other nodes from the perspective of scheduling heuristics.
if (SDNode *N = SU.getNode())
if (N->getOpcode() == ISD::CopyFromReg &&
- TargetRegisterInfo::isVirtualRegister
- (cast<RegisterSDNode>(N->getOperand(1))->getReg()))
+ Register::isVirtualRegister(
+ cast<RegisterSDNode>(N->getOperand(1))->getReg()))
continue;
// Perform checks on the successors of PredSU.
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 568c6191e512..d4c1fb36475e 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -115,7 +115,7 @@ static void CheckForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op,
return;
unsigned Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
return;
unsigned ResNo = User->getOperand(2).getResNo();
@@ -528,7 +528,7 @@ void ScheduleDAGSDNodes::AddSchedEdges() {
/// are input. This SUnit graph is similar to the SelectionDAG, but
/// excludes nodes that aren't interesting to scheduling, and represents
/// glued together nodes with a single SUnit.
-void ScheduleDAGSDNodes::BuildSchedGraph(AliasAnalysis *AA) {
+void ScheduleDAGSDNodes::BuildSchedGraph(AAResults *AA) {
// Cluster certain nodes which should be scheduled together.
ClusterNodes();
// Populate the SUnits array.
@@ -656,7 +656,7 @@ void ScheduleDAGSDNodes::computeOperandLatency(SDNode *Def, SDNode *Use,
if (Latency > 1 && Use->getOpcode() == ISD::CopyToReg &&
!BB->succ_empty()) {
unsigned Reg = cast<RegisterSDNode>(Use->getOperand(1))->getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
// This copy is a liveout value. It is likely coalesced, so reduce the
// latency so not to penalize the def.
// FIXME: need target specific adjustment here?
@@ -808,7 +808,7 @@ EmitPhysRegCopy(SUnit *SU, DenseMap<SUnit*, unsigned> &VRBaseMap,
} else {
// Copy from physical register.
assert(I->getReg() && "Unknown physical register!");
- unsigned VRBase = MRI.createVirtualRegister(SU->CopyDstRC);
+ Register VRBase = MRI.createVirtualRegister(SU->CopyDstRC);
bool isNew = VRBaseMap.insert(std::make_pair(SU, VRBase)).second;
(void)isNew; // Silence compiler warning.
assert(isNew && "Node emitted out of order - early");
@@ -909,6 +909,12 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
// Remember the source order of the inserted instruction.
if (HasDbg)
ProcessSourceNode(N, DAG, Emitter, VRBaseMap, Orders, Seen, NewInsn);
+
+ if (MDNode *MD = DAG->getHeapAllocSite(N)) {
+ if (NewInsn && NewInsn->isCall())
+ MF.addCodeViewHeapAllocSite(NewInsn, MD);
+ }
+
GluedNodes.pop_back();
}
auto NewInsn =
@@ -917,6 +923,10 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
if (HasDbg)
ProcessSourceNode(SU->getNode(), DAG, Emitter, VRBaseMap, Orders, Seen,
NewInsn);
+ if (MDNode *MD = DAG->getHeapAllocSite(SU->getNode())) {
+ if (NewInsn && NewInsn->isCall())
+ MF.addCodeViewHeapAllocSite(NewInsn, MD);
+ }
}
// Insert all the dbg_values which have not already been inserted in source
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
index 5163b4fa4fd3..183ce4b0652d 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
@@ -26,6 +26,7 @@
namespace llvm {
+class AAResults;
class InstrItineraryData;
/// ScheduleDAGSDNodes - A ScheduleDAG for scheduling SDNode-based DAGs.
@@ -93,7 +94,7 @@ class InstrItineraryData;
/// are input. This SUnit graph is similar to the SelectionDAG, but
/// excludes nodes that aren't interesting to scheduling, and represents
/// flagged together nodes with a single SUnit.
- void BuildSchedGraph(AliasAnalysis *AA);
+ void BuildSchedGraph(AAResults *AA);
/// InitNumRegDefsLeft - Determine the # of regs defined by this node.
///
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
index ab06b55b49fd..e7bac73678a7 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
@@ -63,14 +63,13 @@ private:
/// HazardRec - The hazard recognizer to use.
ScheduleHazardRecognizer *HazardRec;
- /// AA - AliasAnalysis for making memory reference queries.
- AliasAnalysis *AA;
+ /// AA - AAResults for making memory reference queries.
+ AAResults *AA;
public:
- ScheduleDAGVLIW(MachineFunction &mf,
- AliasAnalysis *aa,
+ ScheduleDAGVLIW(MachineFunction &mf, AAResults *aa,
SchedulingPriorityQueue *availqueue)
- : ScheduleDAGSDNodes(mf), AvailableQueue(availqueue), AA(aa) {
+ : ScheduleDAGSDNodes(mf), AvailableQueue(availqueue), AA(aa) {
const TargetSubtargetInfo &STI = mf.getSubtarget();
HazardRec = STI.getInstrInfo()->CreateTargetHazardRecognizer(&STI, this);
}
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 5852e693fa9f..52a71b91d93f 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -859,9 +859,8 @@ bool SelectionDAG::RemoveNodeFromCSEMaps(SDNode *N) {
break;
case ISD::TargetExternalSymbol: {
ExternalSymbolSDNode *ESN = cast<ExternalSymbolSDNode>(N);
- Erased = TargetExternalSymbols.erase(
- std::pair<std::string,unsigned char>(ESN->getSymbol(),
- ESN->getTargetFlags()));
+ Erased = TargetExternalSymbols.erase(std::pair<std::string, unsigned>(
+ ESN->getSymbol(), ESN->getTargetFlags()));
break;
}
case ISD::MCSymbol: {
@@ -1084,6 +1083,7 @@ void SelectionDAG::clear() {
ExternalSymbols.clear();
TargetExternalSymbols.clear();
MCSymbols.clear();
+ SDCallSiteDbgInfo.clear();
std::fill(CondCodeNodes.begin(), CondCodeNodes.end(),
static_cast<CondCodeSDNode*>(nullptr));
std::fill(ValueTypeNodes.begin(), ValueTypeNodes.end(),
@@ -1353,7 +1353,7 @@ SDValue SelectionDAG::getConstantFP(double Val, const SDLoc &DL, EVT VT,
SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, const SDLoc &DL,
EVT VT, int64_t Offset, bool isTargetGA,
- unsigned char TargetFlags) {
+ unsigned TargetFlags) {
assert((TargetFlags == 0 || isTargetGA) &&
"Cannot set target flags on target-independent globals");
@@ -1400,7 +1400,7 @@ SDValue SelectionDAG::getFrameIndex(int FI, EVT VT, bool isTarget) {
}
SDValue SelectionDAG::getJumpTable(int JTI, EVT VT, bool isTarget,
- unsigned char TargetFlags) {
+ unsigned TargetFlags) {
assert((TargetFlags == 0 || isTarget) &&
"Cannot set target flags on target-independent jump tables");
unsigned Opc = isTarget ? ISD::TargetJumpTable : ISD::JumpTable;
@@ -1421,7 +1421,7 @@ SDValue SelectionDAG::getJumpTable(int JTI, EVT VT, bool isTarget,
SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT,
unsigned Alignment, int Offset,
bool isTarget,
- unsigned char TargetFlags) {
+ unsigned TargetFlags) {
assert((TargetFlags == 0 || isTarget) &&
"Cannot set target flags on target-independent globals");
if (Alignment == 0)
@@ -1449,7 +1449,7 @@ SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT,
SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
unsigned Alignment, int Offset,
bool isTarget,
- unsigned char TargetFlags) {
+ unsigned TargetFlags) {
assert((TargetFlags == 0 || isTarget) &&
"Cannot set target flags on target-independent globals");
if (Alignment == 0)
@@ -1473,7 +1473,7 @@ SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
}
SDValue SelectionDAG::getTargetIndex(int Index, EVT VT, int64_t Offset,
- unsigned char TargetFlags) {
+ unsigned TargetFlags) {
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::TargetIndex, getVTList(VT), None);
ID.AddInteger(Index);
@@ -1535,10 +1535,9 @@ SDValue SelectionDAG::getMCSymbol(MCSymbol *Sym, EVT VT) {
}
SDValue SelectionDAG::getTargetExternalSymbol(const char *Sym, EVT VT,
- unsigned char TargetFlags) {
+ unsigned TargetFlags) {
SDNode *&N =
- TargetExternalSymbols[std::pair<std::string,unsigned char>(Sym,
- TargetFlags)];
+ TargetExternalSymbols[std::pair<std::string, unsigned>(Sym, TargetFlags)];
if (N) return SDValue(N, 0);
N = newSDNode<ExternalSymbolSDNode>(true, Sym, TargetFlags, VT);
InsertNode(N);
@@ -1802,9 +1801,8 @@ SDValue SelectionDAG::getLabelNode(unsigned Opcode, const SDLoc &dl,
}
SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT,
- int64_t Offset,
- bool isTarget,
- unsigned char TargetFlags) {
+ int64_t Offset, bool isTarget,
+ unsigned TargetFlags) {
unsigned Opc = isTarget ? ISD::TargetBlockAddress : ISD::BlockAddress;
FoldingSetNodeID ID;
@@ -1900,20 +1898,19 @@ SDValue SelectionDAG::expandVAArg(SDNode *Node) {
EVT VT = Node->getValueType(0);
SDValue Tmp1 = Node->getOperand(0);
SDValue Tmp2 = Node->getOperand(1);
- unsigned Align = Node->getConstantOperandVal(3);
+ const MaybeAlign MA(Node->getConstantOperandVal(3));
SDValue VAListLoad = getLoad(TLI.getPointerTy(getDataLayout()), dl, Tmp1,
Tmp2, MachinePointerInfo(V));
SDValue VAList = VAListLoad;
- if (Align > TLI.getMinStackArgumentAlignment()) {
- assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2");
-
+ if (MA && *MA > TLI.getMinStackArgumentAlignment()) {
VAList = getNode(ISD::ADD, dl, VAList.getValueType(), VAList,
- getConstant(Align - 1, dl, VAList.getValueType()));
+ getConstant(MA->value() - 1, dl, VAList.getValueType()));
- VAList = getNode(ISD::AND, dl, VAList.getValueType(), VAList,
- getConstant(-(int64_t)Align, dl, VAList.getValueType()));
+ VAList =
+ getNode(ISD::AND, dl, VAList.getValueType(), VAList,
+ getConstant(-(int64_t)MA->value(), dl, VAList.getValueType()));
}
// Increment the pointer, VAList, to the next vaarg
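Note on the alignment handling above: adding Align - 1 and masking with -Align rounds the va_list pointer up to the next multiple of Align (a power of two), which is exactly what the ADD/AND pair emitted here computes. A plain-integer check:

    #include <cassert>
    #include <cstdint>

    static uint64_t roundUp(uint64_t P, uint64_t Align) {
      return (P + Align - 1) & -Align;
    }

    int main() {
      assert(roundUp(0x1001, 16) == 0x1010); // bumped to next 16-byte slot
      assert(roundUp(0x1010, 16) == 0x1010); // already aligned: unchanged
      return 0;
    }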
@@ -2154,12 +2151,9 @@ SDValue SelectionDAG::GetDemandedBits(SDValue V, const APInt &DemandedBits,
}
case ISD::OR:
case ISD::XOR:
- // If the LHS or RHS don't contribute bits to the or, drop them.
- if (MaskedValueIsZero(V.getOperand(0), DemandedBits))
- return V.getOperand(1);
- if (MaskedValueIsZero(V.getOperand(1), DemandedBits))
- return V.getOperand(0);
- break;
+ case ISD::SIGN_EXTEND_INREG:
+ return TLI->SimplifyMultipleUseDemandedBits(V, DemandedBits, DemandedElts,
+ *this, 0);
case ISD::SRL:
// Only look at single-use SRLs.
if (!V.getNode()->hasOneUse())
@@ -2203,15 +2197,6 @@ SDValue SelectionDAG::GetDemandedBits(SDValue V, const APInt &DemandedBits,
return getNode(ISD::ANY_EXTEND, SDLoc(V), V.getValueType(), DemandedSrc);
break;
}
- case ISD::SIGN_EXTEND_INREG:
- EVT ExVT = cast<VTSDNode>(V.getOperand(1))->getVT();
- unsigned ExVTBits = ExVT.getScalarSizeInBits();
-
- // If none of the extended bits are demanded, eliminate the sextinreg.
- if (DemandedBits.getActiveBits() <= ExVTBits)
- return V.getOperand(0);
-
- break;
}
return SDValue();
}
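Note on the change above: the local OR/XOR/SIGN_EXTEND_INREG rules are folded into SimplifyMultipleUseDemandedBits. A worked instance of the OR rule that used to live here: an operand whose demanded bits are all known zero contributes nothing and can be dropped.

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t X = 0x12340000, Y = 0x000000FF, Demanded = 0xFFFF0000;
      // Y is zero in every demanded bit, so (X | Y) equals X under the mask.
      assert((((X | Y) ^ X) & Demanded) == 0);
      return 0;
    }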
@@ -2395,15 +2380,39 @@ SDValue SelectionDAG::getSplatValue(SDValue V) {
/// If a SHL/SRA/SRL node has a constant or splat constant shift amount that
/// is less than the element bit-width of the shift node, return it.
static const APInt *getValidShiftAmountConstant(SDValue V) {
+ unsigned BitWidth = V.getScalarValueSizeInBits();
if (ConstantSDNode *SA = isConstOrConstSplat(V.getOperand(1))) {
// Shifting more than the bitwidth is not valid.
const APInt &ShAmt = SA->getAPIntValue();
- if (ShAmt.ult(V.getScalarValueSizeInBits()))
+ if (ShAmt.ult(BitWidth))
return &ShAmt;
}
return nullptr;
}
+/// If a SHL/SRA/SRL node has constant vector shift amounts that are all less
+/// than the element bit-width of the shift node, return the minimum value.
+static const APInt *getValidMinimumShiftAmountConstant(SDValue V) {
+ unsigned BitWidth = V.getScalarValueSizeInBits();
+ auto *BV = dyn_cast<BuildVectorSDNode>(V.getOperand(1));
+ if (!BV)
+ return nullptr;
+ const APInt *MinShAmt = nullptr;
+ for (unsigned i = 0, e = BV->getNumOperands(); i != e; ++i) {
+ auto *SA = dyn_cast<ConstantSDNode>(BV->getOperand(i));
+ if (!SA)
+ return nullptr;
+ // Shifting more than the bitwidth is not valid.
+ const APInt &ShAmt = SA->getAPIntValue();
+ if (ShAmt.uge(BitWidth))
+ return nullptr;
+ if (MinShAmt && MinShAmt->ule(ShAmt))
+ continue;
+ MinShAmt = &ShAmt;
+ }
+ return MinShAmt;
+}
+
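Note on the new helper above: computeKnownBits uses it for vector SRL with non-uniform constant shift amounts; if every lane shifts right by at least MinShAmt, the top MinShAmt bits of every lane are known zero. A plain-integer check of that fact:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint8_t Lane[4] = {0xFF, 0x80, 0x7F, 0x01};
      unsigned Sh[4] = {3, 5, 4, 3}; // minimum shift amount is 3
      for (int i = 0; i != 4; ++i)
        assert(((Lane[i] >> Sh[i]) & 0xE0) == 0); // top 3 bits are zero
      return 0;
    }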
/// Determine which bits of Op are known to be either zero or one and return
/// them in Known. For vectors, the known bits are those that are shared by
/// every vector element.
@@ -2437,7 +2446,7 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
return Known;
}
- if (Depth == 6)
+ if (Depth >= MaxRecursionDepth)
return Known; // Limit search depth.
KnownBits Known2;
@@ -2582,14 +2591,13 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
SDValue Src = Op.getOperand(0);
ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+ APInt DemandedSrc = APInt::getAllOnesValue(NumSrcElts);
if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
// Offset the demanded elts by the subvector index.
uint64_t Idx = SubIdx->getZExtValue();
- APInt DemandedSrc = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
- Known = computeKnownBits(Src, DemandedSrc, Depth + 1);
- } else {
- Known = computeKnownBits(Src, Depth + 1);
+ DemandedSrc = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
}
+ Known = computeKnownBits(Src, DemandedSrc, Depth + 1);
break;
}
case ISD::SCALAR_TO_VECTOR: {
@@ -2800,25 +2808,9 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
Known.One.lshrInPlace(Shift);
// High bits are known zero.
Known.Zero.setHighBits(Shift);
- } else if (auto *BV = dyn_cast<BuildVectorSDNode>(Op.getOperand(1))) {
- // If the shift amount is a vector of constants see if we can bound
- // the number of upper zero bits.
- unsigned ShiftAmountMin = BitWidth;
- for (unsigned i = 0; i != BV->getNumOperands(); ++i) {
- if (auto *C = dyn_cast<ConstantSDNode>(BV->getOperand(i))) {
- const APInt &ShAmt = C->getAPIntValue();
- if (ShAmt.ult(BitWidth)) {
- ShiftAmountMin = std::min<unsigned>(ShiftAmountMin,
- ShAmt.getZExtValue());
- continue;
- }
- }
- // Don't know anything.
- ShiftAmountMin = 0;
- break;
- }
-
- Known.Zero.setHighBits(ShiftAmountMin);
+ } else if (const APInt *ShMinAmt = getValidMinimumShiftAmountConstant(Op)) {
+ // At least the minimum shift amount's worth of high bits are known zero.
+ Known.Zero.setHighBits(ShMinAmt->getZExtValue());
}
break;
case ISD::SRA:
@@ -3105,12 +3097,12 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
// If the first operand is non-negative or has all low bits zero, then
// the upper bits are all zero.
- if (Known2.Zero[BitWidth-1] || ((Known2.Zero & LowBits) == LowBits))
+ if (Known2.isNonNegative() || LowBits.isSubsetOf(Known2.Zero))
Known.Zero |= ~LowBits;
// If the first operand is negative and not all low bits are zero, then
// the upper bits are all one.
- if (Known2.One[BitWidth-1] && ((Known2.One & LowBits) != 0))
+ if (Known2.isNegative() && LowBits.intersects(Known2.One))
Known.One |= ~LowBits;
assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?");
}
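
The rewritten conditions above are behavior-preserving: isNonNegative() is the old sign-bit-zero test and LowBits.isSubsetOf(Known2.Zero) the old mask equality. A toy check of the SREM-by-power-of-two fact they encode:

    #include <cassert>
    #include <cstdint>

    int main() {
      int32_t A = 1234, B = -1235, K = 8;   // remainder by 2^3
      // Non-negative dividend: bits above the low 3 are zero.
      assert((A % K) >= 0 && ((A % K) & ~7) == 0);
      // Negative dividend with nonzero low part: those bits are all one.
      assert((B % K) < 0 && (uint32_t(B % K) | 7u) == 0xFFFFFFFFu);
    }
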
@@ -3427,7 +3419,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
return Val.getNumSignBits();
}
- if (Depth == 6)
+ if (Depth >= MaxRecursionDepth)
return 1; // Limit search depth.
if (!DemandedElts)
@@ -3729,6 +3721,18 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
if (Tmp == 1) return 1; // Early out.
return std::min(Tmp, Tmp2)-1;
+ case ISD::MUL: {
+ // The output of the Mul can be at most twice the valid bits in the inputs.
+ unsigned SignBitsOp0 = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+ if (SignBitsOp0 == 1)
+ break;
+ unsigned SignBitsOp1 = ComputeNumSignBits(Op.getOperand(1), Depth + 1);
+ if (SignBitsOp1 == 1)
+ break;
+ unsigned OutValidBits =
+ (VTBits - SignBitsOp0 + 1) + (VTBits - SignBitsOp1 + 1);
+ return OutValidBits > VTBits ? 1 : VTBits - OutValidBits + 1;
+ }
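
A worked instance of the new ISD::MUL rule: with VTBits = 32, 25 sign bits on one operand (8 valid bits) and 29 on the other (4 valid bits) give OutValidBits = 12, so the product keeps at least 32 - 12 + 1 = 21 sign bits. Standalone check (numSignBits is a hand-rolled stand-in, not the LLVM helper):

    #include <cassert>
    #include <cstdint>

    unsigned numSignBits(int32_t V) {
      int Sign = V < 0 ? 1 : 0;
      unsigned N = 1;                       // the sign bit itself
      for (int Bit = 30; Bit >= 0 && ((V >> Bit) & 1) == Sign; --Bit)
        ++N;
      return N;
    }

    int main() {
      int32_t A = -100, B = -7;             // 25 and 29 sign bits
      assert(numSignBits(A) == 25 && numSignBits(B) == 29);
      assert(numSignBits(A * B) >= 21);     // A * B = 700
    }
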
case ISD::TRUNCATE: {
// Check if the sign bits of source go down as far as the truncated value.
unsigned NumSrcBits = Op.getOperand(0).getScalarValueSizeInBits();
@@ -3817,13 +3821,13 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
SDValue Src = Op.getOperand(0);
ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+ APInt DemandedSrc = APInt::getAllOnesValue(NumSrcElts);
if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
// Offset the demanded elts by the subvector index.
uint64_t Idx = SubIdx->getZExtValue();
- APInt DemandedSrc = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
- return ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
+ DemandedSrc = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
}
- return ComputeNumSignBits(Src, Depth + 1);
+ return ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
}
case ISD::CONCAT_VECTORS: {
// Determine the minimum number of sign bits across all demanded
@@ -3976,7 +3980,7 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const
if (getTarget().Options.NoNaNsFPMath || Op->getFlags().hasNoNaNs())
return true;
- if (Depth == 6)
+ if (Depth >= MaxRecursionDepth)
return false; // Limit search depth.
// TODO: Handle vectors.
@@ -4645,7 +4649,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
return getUNDEF(VT);
// -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0
- if ((getTarget().Options.UnsafeFPMath || Flags.hasNoSignedZeros()) &&
+ if ((getTarget().Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros()) &&
OpOpcode == ISD::FSUB)
return getNode(ISD::FSUB, DL, VT, Operand.getOperand(1),
Operand.getOperand(0), Flags);
@@ -5156,22 +5160,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
if (N2C && N2C->isNullValue())
return N1;
break;
- case ISD::FP_ROUND_INREG: {
- EVT EVT = cast<VTSDNode>(N2)->getVT();
- assert(VT == N1.getValueType() && "Not an inreg round!");
- assert(VT.isFloatingPoint() && EVT.isFloatingPoint() &&
- "Cannot FP_ROUND_INREG integer types");
- assert(EVT.isVector() == VT.isVector() &&
- "FP_ROUND_INREG type should be vector iff the operand "
- "type is vector!");
- assert((!EVT.isVector() ||
- EVT.getVectorNumElements() == VT.getVectorNumElements()) &&
- "Vector element counts must match in FP_ROUND_INREG");
- assert(EVT.bitsLE(VT) && "Not rounding down!");
- (void)EVT;
- if (cast<VTSDNode>(N2)->getVT() == VT) return N1; // Not actually rounding.
- break;
- }
case ISD::FP_ROUND:
assert(VT.isFloatingPoint() &&
N1.getValueType().isFloatingPoint() &&
@@ -5382,7 +5370,6 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
std::swap(N1, N2);
} else {
switch (Opcode) {
- case ISD::FP_ROUND_INREG:
case ISD::SIGN_EXTEND_INREG:
case ISD::SUB:
return getUNDEF(VT); // fold op(undef, arg2) -> undef
@@ -5770,7 +5757,7 @@ static void chainLoadsAndStoresForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
- uint64_t Size, unsigned Align,
+ uint64_t Size, unsigned Alignment,
bool isVol, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) {
@@ -5795,15 +5782,15 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
if (FI && !MFI.isFixedObjectIndex(FI->getIndex()))
DstAlignCanChange = true;
unsigned SrcAlign = DAG.InferPtrAlignment(Src);
- if (Align > SrcAlign)
- SrcAlign = Align;
+ if (Alignment > SrcAlign)
+ SrcAlign = Alignment;
ConstantDataArraySlice Slice;
bool CopyFromConstant = isMemSrcFromConstant(Src, Slice);
bool isZeroConstant = CopyFromConstant && Slice.Array == nullptr;
unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemcpy(OptSize);
if (!TLI.findOptimalMemOpLowering(
- MemOps, Limit, Size, (DstAlignCanChange ? 0 : Align),
+ MemOps, Limit, Size, (DstAlignCanChange ? 0 : Alignment),
(isZeroConstant ? 0 : SrcAlign), /*IsMemset=*/false,
/*ZeroMemset=*/false, /*MemcpyStrSrc=*/CopyFromConstant,
/*AllowOverlap=*/!isVol, DstPtrInfo.getAddrSpace(),
@@ -5818,15 +5805,15 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
// realignment.
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
if (!TRI->needsStackRealignment(MF))
- while (NewAlign > Align &&
- DL.exceedsNaturalStackAlignment(NewAlign))
- NewAlign /= 2;
+ while (NewAlign > Alignment &&
+ DL.exceedsNaturalStackAlignment(Align(NewAlign)))
+ NewAlign /= 2;
- if (NewAlign > Align) {
+ if (NewAlign > Alignment) {
// Give the stack frame object a larger alignment if needed.
if (MFI.getObjectAlignment(FI->getIndex()) < NewAlign)
MFI.setObjectAlignment(FI->getIndex(), NewAlign);
- Align = NewAlign;
+ Alignment = NewAlign;
}
}
@@ -5869,10 +5856,9 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
}
Value = getMemsetStringVal(VT, dl, DAG, TLI, SubSlice);
if (Value.getNode()) {
- Store = DAG.getStore(Chain, dl, Value,
- DAG.getMemBasePlusOffset(Dst, DstOff, dl),
- DstPtrInfo.getWithOffset(DstOff), Align,
- MMOFlags);
+ Store = DAG.getStore(
+ Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl),
+ DstPtrInfo.getWithOffset(DstOff), Alignment, MMOFlags);
OutChains.push_back(Store);
}
}
@@ -5900,7 +5886,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
Store = DAG.getTruncStore(
Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl),
- DstPtrInfo.getWithOffset(DstOff), VT, Align, MMOFlags);
+ DstPtrInfo.getWithOffset(DstOff), VT, Alignment, MMOFlags);
OutStoreChains.push_back(Store);
}
SrcOff += VTSize;
@@ -6567,7 +6553,7 @@ SDValue SelectionDAG::getMergeValues(ArrayRef<SDValue> Ops, const SDLoc &dl) {
SDValue SelectionDAG::getMemIntrinsicNode(
unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops,
EVT MemVT, MachinePointerInfo PtrInfo, unsigned Align,
- MachineMemOperand::Flags Flags, unsigned Size, const AAMDNodes &AAInfo) {
+ MachineMemOperand::Flags Flags, uint64_t Size, const AAMDNodes &AAInfo) {
if (Align == 0) // Ensure that codegen never sees alignment 0
Align = getEVTAlignment(MemVT);
@@ -6619,7 +6605,9 @@ SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl,
createOperands(N, Ops);
}
InsertNode(N);
- return SDValue(N, 0);
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
}
SDValue SelectionDAG::getLifetimeNode(bool IsStart, const SDLoc &dl,
@@ -7022,14 +7010,15 @@ SDValue SelectionDAG::getMaskedStore(SDValue Chain, const SDLoc &dl,
SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl,
ArrayRef<SDValue> Ops,
- MachineMemOperand *MMO) {
+ MachineMemOperand *MMO,
+ ISD::MemIndexType IndexType) {
assert(Ops.size() == 6 && "Incompatible number of operands");
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::MGATHER, VTs, Ops);
ID.AddInteger(VT.getRawBits());
ID.AddInteger(getSyntheticNodeSubclassData<MaskedGatherSDNode>(
- dl.getIROrder(), VTs, VT, MMO));
+ dl.getIROrder(), VTs, VT, MMO, IndexType));
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
@@ -7038,7 +7027,7 @@ SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl,
}
auto *N = newSDNode<MaskedGatherSDNode>(dl.getIROrder(), dl.getDebugLoc(),
- VTs, VT, MMO);
+ VTs, VT, MMO, IndexType);
createOperands(N, Ops);
assert(N->getPassThru().getValueType() == N->getValueType(0) &&
@@ -7062,14 +7051,15 @@ SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl,
SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl,
ArrayRef<SDValue> Ops,
- MachineMemOperand *MMO) {
+ MachineMemOperand *MMO,
+ ISD::MemIndexType IndexType) {
assert(Ops.size() == 6 && "Incompatible number of operands");
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::MSCATTER, VTs, Ops);
ID.AddInteger(VT.getRawBits());
ID.AddInteger(getSyntheticNodeSubclassData<MaskedScatterSDNode>(
- dl.getIROrder(), VTs, VT, MMO));
+ dl.getIROrder(), VTs, VT, MMO, IndexType));
ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
void *IP = nullptr;
if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
@@ -7077,7 +7067,7 @@ SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl,
return SDValue(E, 0);
}
auto *N = newSDNode<MaskedScatterSDNode>(dl.getIROrder(), dl.getDebugLoc(),
- VTs, VT, MMO);
+ VTs, VT, MMO, IndexType);
createOperands(N, Ops);
assert(N->getMask().getValueType().getVectorNumElements() ==
@@ -7766,16 +7756,22 @@ SDNode* SelectionDAG::mutateStrictFPToFP(SDNode *Node) {
case ISD::STRICT_FLOG: NewOpc = ISD::FLOG; break;
case ISD::STRICT_FLOG10: NewOpc = ISD::FLOG10; break;
case ISD::STRICT_FLOG2: NewOpc = ISD::FLOG2; break;
+ case ISD::STRICT_LRINT: NewOpc = ISD::LRINT; break;
+ case ISD::STRICT_LLRINT: NewOpc = ISD::LLRINT; break;
case ISD::STRICT_FRINT: NewOpc = ISD::FRINT; break;
case ISD::STRICT_FNEARBYINT: NewOpc = ISD::FNEARBYINT; break;
case ISD::STRICT_FMAXNUM: NewOpc = ISD::FMAXNUM; break;
case ISD::STRICT_FMINNUM: NewOpc = ISD::FMINNUM; break;
case ISD::STRICT_FCEIL: NewOpc = ISD::FCEIL; break;
case ISD::STRICT_FFLOOR: NewOpc = ISD::FFLOOR; break;
+ case ISD::STRICT_LROUND: NewOpc = ISD::LROUND; break;
+ case ISD::STRICT_LLROUND: NewOpc = ISD::LLROUND; break;
case ISD::STRICT_FROUND: NewOpc = ISD::FROUND; break;
case ISD::STRICT_FTRUNC: NewOpc = ISD::FTRUNC; break;
case ISD::STRICT_FP_ROUND: NewOpc = ISD::FP_ROUND; break;
case ISD::STRICT_FP_EXTEND: NewOpc = ISD::FP_EXTEND; break;
+ case ISD::STRICT_FP_TO_SINT: NewOpc = ISD::FP_TO_SINT; break;
+ case ISD::STRICT_FP_TO_UINT: NewOpc = ISD::FP_TO_UINT; break;
}
assert(Node->getNumValues() == 2 && "Unexpected number of results!");
@@ -7925,6 +7921,7 @@ MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &DL,
CSEMap.InsertNode(N, IP);
InsertNode(N);
+ NewSDValueDbgMsg(SDValue(N, 0), "Creating new machine node: ", this);
return N;
}
@@ -8619,7 +8616,7 @@ SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad,
// TokenFactor.
SDValue OldChain = SDValue(OldLoad, 1);
SDValue NewChain = SDValue(NewMemOp.getNode(), 1);
- if (!OldLoad->hasAnyUseOfValue(1))
+ if (OldChain == NewChain || !OldLoad->hasAnyUseOfValue(1))
return NewChain;
SDValue TokenFactor =
@@ -8812,7 +8809,7 @@ HandleSDNode::~HandleSDNode() {
GlobalAddressSDNode::GlobalAddressSDNode(unsigned Opc, unsigned Order,
const DebugLoc &DL,
const GlobalValue *GA, EVT VT,
- int64_t o, unsigned char TF)
+ int64_t o, unsigned TF)
: SDNode(Opc, Order, DL, getSDVTList(VT)), Offset(o), TargetFlags(TF) {
TheGlobal = GA;
}
@@ -8986,7 +8983,7 @@ bool SDValue::reachesChainWithoutSideEffects(SDValue Dest,
// Loads don't have side effects, look through them.
if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(*this)) {
- if (!Ld->isVolatile())
+ if (Ld->isUnordered())
return Ld->getChain().reachesChainWithoutSideEffects(Dest, Depth-1);
}
return false;
@@ -9005,21 +9002,51 @@ void SDNode::intersectFlagsWith(const SDNodeFlags Flags) {
SDValue
SelectionDAG::matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp,
- ArrayRef<ISD::NodeType> CandidateBinOps) {
+ ArrayRef<ISD::NodeType> CandidateBinOps,
+ bool AllowPartials) {
// The pattern must end in an extract from index 0.
if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isNullConstant(Extract->getOperand(1)))
return SDValue();
- SDValue Op = Extract->getOperand(0);
- unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());
-
// Match against one of the candidate binary ops.
+ SDValue Op = Extract->getOperand(0);
if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
return Op.getOpcode() == unsigned(BinOp);
}))
return SDValue();
+ // Floating-point reductions may require relaxed constraints on the final step
+ // of the reduction because they may reorder intermediate operations.
+ unsigned CandidateBinOp = Op.getOpcode();
+ if (Op.getValueType().isFloatingPoint()) {
+ SDNodeFlags Flags = Op->getFlags();
+ switch (CandidateBinOp) {
+ case ISD::FADD:
+ if (!Flags.hasNoSignedZeros() || !Flags.hasAllowReassociation())
+ return SDValue();
+ break;
+ default:
+ llvm_unreachable("Unhandled FP opcode for binop reduction");
+ }
+ }
+
+ // If full matching fails partway, check whether enough stages matched that
+ // a partial reduction from a subvector is still possible.
+ auto PartialReduction = [&](SDValue Op, unsigned NumSubElts) {
+ if (!AllowPartials || !Op)
+ return SDValue();
+ EVT OpVT = Op.getValueType();
+ EVT OpSVT = OpVT.getScalarType();
+ EVT SubVT = EVT::getVectorVT(*getContext(), OpSVT, NumSubElts);
+ if (!TLI->isExtractSubvectorCheap(SubVT, OpVT, 0))
+ return SDValue();
+ BinOp = (ISD::NodeType)CandidateBinOp;
+ return getNode(
+ ISD::EXTRACT_SUBVECTOR, SDLoc(Op), SubVT, Op,
+ getConstant(0, SDLoc(Op), TLI->getVectorIdxTy(getDataLayout())));
+ };
+
// At each stage, we're looking for something that looks like:
// %s = shufflevector <8 x i32> %op, <8 x i32> undef,
// <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
@@ -9030,10 +9057,16 @@ SelectionDAG::matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp,
// <4,5,6,7,u,u,u,u>
// <2,3,u,u,u,u,u,u>
// <1,u,u,u,u,u,u,u>
- unsigned CandidateBinOp = Op.getOpcode();
+ // While a partial reduction match would be:
+ // <2,3,u,u,u,u,u,u>
+ // <1,u,u,u,u,u,u,u>
+ unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());
+ SDValue PrevOp;
for (unsigned i = 0; i < Stages; ++i) {
+ unsigned MaskEnd = (1 << i);
+
if (Op.getOpcode() != CandidateBinOp)
- return SDValue();
+ return PartialReduction(PrevOp, MaskEnd);
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
@@ -9049,12 +9082,14 @@ SelectionDAG::matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp,
// The first operand of the shuffle should be the same as the other operand
// of the binop.
if (!Shuffle || Shuffle->getOperand(0) != Op)
- return SDValue();
+ return PartialReduction(PrevOp, MaskEnd);
// Verify the shuffle has the expected (at this stage of the pyramid) mask.
- for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
- if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
- return SDValue();
+ for (int Index = 0; Index < (int)MaskEnd; ++Index)
+ if (Shuffle->getMaskElt(Index) != (int)(MaskEnd + Index))
+ return PartialReduction(PrevOp, MaskEnd);
+
+ PrevOp = Op;
}
BinOp = (ISD::NodeType)CandidateBinOp;
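
To summarize the loop above: a full match consumes all Log2(NumElts) shuffle+binop stages; if matching stops after some stage, the new PartialReduction fallback instead reduces the low MaskEnd lanes of the last matched value. Toy model of one matched stage over 4 lanes:

    #include <cassert>

    int main() {
      int V[4] = {1, 2, 3, 4};
      // One stage: S = V + shuffle<2,3,u,u>(V).
      int S[2] = {V[0] + V[2], V[1] + V[3]};
      // The full horizontal sum now lives in the low 2 lanes, so an
      // EXTRACT_SUBVECTOR of those lanes is a valid partial reduction.
      assert(V[0] + V[1] + V[2] + V[3] == S[0] + S[1]);
    }
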
@@ -9114,8 +9149,7 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
getShiftAmountOperand(Operands[0].getValueType(),
Operands[1])));
break;
- case ISD::SIGN_EXTEND_INREG:
- case ISD::FP_ROUND_INREG: {
+ case ISD::SIGN_EXTEND_INREG: {
EVT ExtVT = cast<VTSDNode>(Operands[1])->getVT().getVectorElementType();
Scalars.push_back(getNode(N->getOpcode(), dl, EltVT,
Operands[0],
@@ -9187,6 +9221,9 @@ bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD,
int Dist) const {
if (LD->isVolatile() || Base->isVolatile())
return false;
+ // TODO: This is probably too restrictive for atomics; revisit.
+ if (!LD->isSimple())
+ return false;
if (LD->isIndexed() || Base->isIndexed())
return false;
if (LD->getChain() != Base->getChain())
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
index 9592bc30a4e1..3a53ab9717a4 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
@@ -14,6 +14,7 @@
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
#include <cstdint>
using namespace llvm;
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index e818dd27c05e..8c15563fcd23 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -833,7 +833,7 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
// If the source register was virtual and if we know something about it,
// add an assert node.
- if (!TargetRegisterInfo::isVirtualRegister(Regs[Part+i]) ||
+ if (!Register::isVirtualRegister(Regs[Part + i]) ||
!RegisterVT.isInteger())
continue;
@@ -948,8 +948,7 @@ void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching,
unsigned Flag = InlineAsm::getFlagWord(Code, Regs.size());
if (HasMatching)
Flag = InlineAsm::getFlagWordForMatchingOp(Flag, MatchingIdx);
- else if (!Regs.empty() &&
- TargetRegisterInfo::isVirtualRegister(Regs.front())) {
+ else if (!Regs.empty() && Register::isVirtualRegister(Regs.front())) {
// Put the register class of the virtual registers in the flag word. That
// way, later passes can recompute register class constraints for inline
// assembly as well as normal instructions.
@@ -1810,7 +1809,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
// offsets to its parts don't wrap either.
SDValue Ptr = DAG.getObjectPtrOffset(getCurSDLoc(), RetPtr, Offsets[i]);
- SDValue Val = RetOp.getValue(i);
+ SDValue Val = RetOp.getValue(RetOp.getResNo() + i);
if (MemVTs[i] != ValueVTs[i])
Val = DAG.getPtrExtOrTrunc(Val, getCurSDLoc(), MemVTs[i]);
Chains[i] = DAG.getStore(Chain, getCurSDLoc(), Val,
@@ -2263,7 +2262,7 @@ void SelectionDAGBuilder::visitBr(const BranchInst &I) {
if (const BinaryOperator *BOp = dyn_cast<BinaryOperator>(CondVal)) {
Instruction::BinaryOps Opcode = BOp->getOpcode();
if (!DAG.getTargetLoweringInfo().isJumpExpensive() && BOp->hasOneUse() &&
- !I.getMetadata(LLVMContext::MD_unpredictable) &&
+ !I.hasMetadata(LLVMContext::MD_unpredictable) &&
(Opcode == Instruction::And || Opcode == Instruction::Or)) {
FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB,
Opcode,
@@ -2600,9 +2599,11 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
void
SelectionDAGBuilder::visitSPDescriptorFailure(StackProtectorDescriptor &SPD) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setDiscardResult(true);
SDValue Chain =
TLI.makeLibCall(DAG, RTLIB::STACKPROTECTOR_CHECK_FAIL, MVT::isVoid,
- None, false, getCurSDLoc(), false, false).second;
+ None, CallOptions, getCurSDLoc()).second;
// On PS4, the "return address" must still be within the calling function,
// even if it's at the very end, so emit an explicit TRAP here.
// Passing 'true' for doesNotReturn above won't generate the trap for us.
@@ -2618,24 +2619,18 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B,
MachineBasicBlock *SwitchBB) {
SDLoc dl = getCurSDLoc();
- // Subtract the minimum value
+ // Subtract the minimum value.
SDValue SwitchOp = getValue(B.SValue);
EVT VT = SwitchOp.getValueType();
- SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SwitchOp,
- DAG.getConstant(B.First, dl, VT));
-
- // Check range
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- SDValue RangeCmp = DAG.getSetCC(
- dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
- Sub.getValueType()),
- Sub, DAG.getConstant(B.Range, dl, VT), ISD::SETUGT);
+ SDValue RangeSub =
+ DAG.getNode(ISD::SUB, dl, VT, SwitchOp, DAG.getConstant(B.First, dl, VT));
// Determine the type of the test operands.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
bool UsePtrType = false;
- if (!TLI.isTypeLegal(VT))
+ if (!TLI.isTypeLegal(VT)) {
UsePtrType = true;
- else {
+ } else {
for (unsigned i = 0, e = B.Cases.size(); i != e; ++i)
if (!isUIntN(VT.getSizeInBits(), B.Cases[i].Mask)) {
// Switch table case range are encoded into series of masks.
@@ -2644,6 +2639,7 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B,
break;
}
}
+ SDValue Sub = RangeSub;
if (UsePtrType) {
VT = TLI.getPointerTy(DAG.getDataLayout());
Sub = DAG.getZExtOrTrunc(Sub, dl, VT);
@@ -2655,20 +2651,29 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B,
MachineBasicBlock* MBB = B.Cases[0].ThisBB;
- addSuccessorWithProb(SwitchBB, B.Default, B.DefaultProb);
+ if (!B.OmitRangeCheck)
+ addSuccessorWithProb(SwitchBB, B.Default, B.DefaultProb);
addSuccessorWithProb(SwitchBB, MBB, B.Prob);
SwitchBB->normalizeSuccProbs();
- SDValue BrRange = DAG.getNode(ISD::BRCOND, dl,
- MVT::Other, CopyTo, RangeCmp,
- DAG.getBasicBlock(B.Default));
+ SDValue Root = CopyTo;
+ if (!B.OmitRangeCheck) {
+ // Conditional branch to the default block.
+ SDValue RangeCmp = DAG.getSetCC(dl,
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+ RangeSub.getValueType()),
+ RangeSub, DAG.getConstant(B.Range, dl, RangeSub.getValueType()),
+ ISD::SETUGT);
+
+ Root = DAG.getNode(ISD::BRCOND, dl, MVT::Other, Root, RangeCmp,
+ DAG.getBasicBlock(B.Default));
+ }
// Avoid emitting unnecessary branches to the next block.
if (MBB != NextBlock(SwitchBB))
- BrRange = DAG.getNode(ISD::BR, dl, MVT::Other, BrRange,
- DAG.getBasicBlock(MBB));
+ Root = DAG.getNode(ISD::BR, dl, MVT::Other, Root, DAG.getBasicBlock(MBB));
- DAG.setRoot(BrRange);
+ DAG.setRoot(Root);
}
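
The range check that OmitRangeCheck now elides is the SUB/SETUGT pair built above; a standalone sketch of that unsigned-wraparound trick (hypothetical names):

    #include <cassert>
    #include <cstdint>

    // True iff First <= X <= First + Range, via one subtract and one
    // unsigned compare (out-of-range values wrap to huge unsigned numbers).
    bool inRange(uint32_t X, uint32_t First, uint32_t Range) {
      return X - First <= Range;
    }

    int main() {
      assert(inRange(7, 4, 5));    // 7 is within [4, 9]
      assert(!inRange(3, 4, 5));   // 3 - 4 wraps to 0xFFFFFFFF
    }
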
/// visitBitTestCase - this function produces one "bit test"
@@ -3266,8 +3271,7 @@ void SelectionDAGBuilder::visitSelect(const User &I) {
// We care about the legality of the operation after it has been type
// legalized.
- while (TLI.getTypeAction(Ctx, VT) != TargetLoweringBase::TypeLegal &&
- VT != TLI.getTypeToTransformTo(Ctx, VT))
+ while (TLI.getTypeAction(Ctx, VT) != TargetLoweringBase::TypeLegal)
VT = TLI.getTypeToTransformTo(Ctx, VT);
// If the vselect is legal, assume we want to leave this as a vector setcc +
@@ -3534,17 +3538,32 @@ void SelectionDAGBuilder::visitExtractElement(const User &I) {
void SelectionDAGBuilder::visitShuffleVector(const User &I) {
SDValue Src1 = getValue(I.getOperand(0));
SDValue Src2 = getValue(I.getOperand(1));
+ Constant *MaskV = cast<Constant>(I.getOperand(2));
SDLoc DL = getCurSDLoc();
-
- SmallVector<int, 8> Mask;
- ShuffleVectorInst::getShuffleMask(cast<Constant>(I.getOperand(2)), Mask);
- unsigned MaskNumElts = Mask.size();
-
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
EVT SrcVT = Src1.getValueType();
unsigned SrcNumElts = SrcVT.getVectorNumElements();
+ if (MaskV->isNullValue() && VT.isScalableVector()) {
+ // Canonical splat form of first element of first input vector.
+ SDValue FirstElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+ SrcVT.getScalarType(), Src1,
+ DAG.getConstant(0, DL,
+ TLI.getVectorIdxTy(DAG.getDataLayout())));
+ setValue(&I, DAG.getNode(ISD::SPLAT_VECTOR, DL, VT, FirstElt));
+ return;
+ }
+
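
Toy semantics of the canonical splat recognized above (an all-zeros shuffle mask replicates lane 0 of the first source), sketched over fixed-size arrays rather than scalable vectors:

    #include <cassert>

    int main() {
      int Src[4] = {42, 1, 2, 3};
      int Mask[4] = {0, 0, 0, 0};   // zeroinitializer shuffle mask
      int Dst[4];
      for (int i = 0; i != 4; ++i)
        Dst[i] = Src[Mask[i]];
      for (int i = 0; i != 4; ++i)
        assert(Dst[i] == 42);       // every lane holds Src[0]
    }
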
+ // For now, we only handle splats for scalable vectors.
+ // The DAGCombiner will perform a BUILD_VECTOR -> SPLAT_VECTOR transformation
+ // for targets that support a SPLAT_VECTOR for non-scalable vector types.
+ assert(!VT.isScalableVector() && "Unsupported scalable vector shuffle");
+
+ SmallVector<int, 8> Mask;
+ ShuffleVectorInst::getShuffleMask(MaskV, Mask);
+ unsigned MaskNumElts = Mask.size();
+
if (SrcNumElts == MaskNumElts) {
setValue(&I, DAG.getVectorShuffle(VT, DL, Src1, Src2, Mask));
return;
@@ -3825,7 +3844,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
// Normalize Vector GEP - all scalar operands should be converted to the
// splat vector.
unsigned VectorWidth = I.getType()->isVectorTy() ?
- cast<VectorType>(I.getType())->getVectorNumElements() : 0;
+ I.getType()->getVectorNumElements() : 0;
if (VectorWidth && !N.getValueType().isVector()) {
LLVMContext &Context = *DAG.getContext();
@@ -3858,12 +3877,11 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
// If this is a scalar constant or a splat vector of constants,
// handle it quickly.
- const auto *CI = dyn_cast<ConstantInt>(Idx);
- if (!CI && isa<ConstantDataVector>(Idx) &&
- cast<ConstantDataVector>(Idx)->getSplatValue())
- CI = cast<ConstantInt>(cast<ConstantDataVector>(Idx)->getSplatValue());
+ const auto *C = dyn_cast<Constant>(Idx);
+ if (C && isa<VectorType>(C->getType()))
+ C = C->getSplatValue();
- if (CI) {
+ if (const auto *CI = dyn_cast_or_null<ConstantInt>(C)) {
if (CI->isZero())
continue;
APInt Offs = ElementSize * CI->getValue().sextOrTrunc(IdxSize);
@@ -3872,7 +3890,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
DAG.getConstant(Offs, dl, EVT::getVectorVT(Context, IdxTy, VectorWidth)) :
DAG.getConstant(Offs, dl, IdxTy);
- // In an inbouds GEP with an offset that is nonnegative even when
+ // In an inbounds GEP with an offset that is nonnegative even when
// interpreted as signed, assume there is no unsigned overflow.
SDNodeFlags Flags;
if (Offs.isNonNegative() && cast<GEPOperator>(I).isInBounds())
@@ -4002,8 +4020,8 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
Type *Ty = I.getType();
bool isVolatile = I.isVolatile();
- bool isNonTemporal = I.getMetadata(LLVMContext::MD_nontemporal) != nullptr;
- bool isInvariant = I.getMetadata(LLVMContext::MD_invariant_load) != nullptr;
+ bool isNonTemporal = I.hasMetadata(LLVMContext::MD_nontemporal);
+ bool isInvariant = I.hasMetadata(LLVMContext::MD_invariant_load);
bool isDereferenceable =
isDereferenceablePointer(SV, I.getType(), DAG.getDataLayout());
unsigned Alignment = I.getAlignment();
@@ -4118,7 +4136,7 @@ void SelectionDAGBuilder::visitStoreToSwiftError(const StoreInst &I) {
SDValue Src = getValue(SrcV);
// Create a virtual register, then update the virtual register.
- unsigned VReg =
+ Register VReg =
SwiftError.getOrCreateVRegDefAt(&I, FuncInfo.MBB, I.getPointerOperand());
// Chain, DL, Reg, N or Chain, DL, Reg, N, Glue
// Chain can be getRoot or getControlRoot.
@@ -4132,8 +4150,8 @@ void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) {
"call visitLoadFromSwiftError when backend supports swifterror");
assert(!I.isVolatile() &&
- I.getMetadata(LLVMContext::MD_nontemporal) == nullptr &&
- I.getMetadata(LLVMContext::MD_invariant_load) == nullptr &&
+ !I.hasMetadata(LLVMContext::MD_nontemporal) &&
+ !I.hasMetadata(LLVMContext::MD_invariant_load) &&
"Support volatile, non temporal, invariant for load_from_swift_error");
const Value *SV = I.getOperand(0);
@@ -4209,7 +4227,7 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
auto MMOFlags = MachineMemOperand::MONone;
if (I.isVolatile())
MMOFlags |= MachineMemOperand::MOVolatile;
- if (I.getMetadata(LLVMContext::MD_nontemporal) != nullptr)
+ if (I.hasMetadata(LLVMContext::MD_nontemporal))
MMOFlags |= MachineMemOperand::MONonTemporal;
MMOFlags |= TLI.getMMOFlags(I);
@@ -4309,8 +4327,9 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I,
// are looking for. If first operand of the GEP is a splat vector - we
// extract the splat value and use it as a uniform base.
// In all other cases the function returns 'false'.
-static bool getUniformBase(const Value* &Ptr, SDValue& Base, SDValue& Index,
- SDValue &Scale, SelectionDAGBuilder* SDB) {
+static bool getUniformBase(const Value *&Ptr, SDValue &Base, SDValue &Index,
+ ISD::MemIndexType &IndexType, SDValue &Scale,
+ SelectionDAGBuilder *SDB) {
SelectionDAG& DAG = SDB->DAG;
LLVMContext &Context = *DAG.getContext();
@@ -4330,8 +4349,13 @@ static bool getUniformBase(const Value* &Ptr, SDValue& Base, SDValue& Index,
// Ensure all the other indices are 0.
for (unsigned i = 1; i < FinalIndex; ++i) {
- auto *C = dyn_cast<ConstantInt>(GEP->getOperand(i));
- if (!C || !C->isZero())
+ auto *C = dyn_cast<Constant>(GEP->getOperand(i));
+ if (!C)
+ return false;
+ if (isa<VectorType>(C->getType()))
+ C = C->getSplatValue();
+ auto *CI = dyn_cast_or_null<ConstantInt>(C);
+ if (!CI || !CI->isZero())
return false;
}
@@ -4346,6 +4370,7 @@ static bool getUniformBase(const Value* &Ptr, SDValue& Base, SDValue& Index,
SDB->getCurSDLoc(), TLI.getPointerTy(DL));
Base = SDB->getValue(Ptr);
Index = SDB->getValue(IndexVal);
+ IndexType = ISD::SIGNED_SCALED;
if (!Index.getValueType().isVector()) {
unsigned GEPWidth = GEP->getType()->getVectorNumElements();
@@ -4373,9 +4398,11 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
SDValue Base;
SDValue Index;
+ ISD::MemIndexType IndexType;
SDValue Scale;
const Value *BasePtr = Ptr;
- bool UniformBase = getUniformBase(BasePtr, Base, Index, Scale, this);
+ bool UniformBase = getUniformBase(BasePtr, Base, Index, IndexType, Scale,
+ this);
const Value *MemOpBasePtr = UniformBase ? BasePtr : nullptr;
MachineMemOperand *MMO = DAG.getMachineFunction().
@@ -4385,11 +4412,12 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
if (!UniformBase) {
Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
Index = getValue(Ptr);
+ IndexType = ISD::SIGNED_SCALED;
Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
}
SDValue Ops[] = { getRoot(), Src0, Mask, Base, Index, Scale };
SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), VT, sdl,
- Ops, MMO);
+ Ops, MMO, IndexType);
DAG.setRoot(Scatter);
setValue(&I, Scatter);
}
@@ -4476,9 +4504,11 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
SDValue Root = DAG.getRoot();
SDValue Base;
SDValue Index;
+ ISD::MemIndexType IndexType;
SDValue Scale;
const Value *BasePtr = Ptr;
- bool UniformBase = getUniformBase(BasePtr, Base, Index, Scale, this);
+ bool UniformBase = getUniformBase(BasePtr, Base, Index, IndexType, Scale,
+ this);
bool ConstantMemory = false;
if (UniformBase && AA &&
AA->pointsToConstantMemory(
@@ -4500,11 +4530,12 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
if (!UniformBase) {
Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
Index = getValue(Ptr);
+ IndexType = ISD::SIGNED_SCALED;
Scale = DAG.getTargetConstant(1, sdl, TLI.getPointerTy(DAG.getDataLayout()));
}
SDValue Ops[] = { Root, Src0, Mask, Base, Index, Scale };
SDValue Gather = DAG.getMaskedGather(DAG.getVTList(VT, MVT::Other), VT, sdl,
- Ops, MMO);
+ Ops, MMO, IndexType);
SDValue OutChain = Gather.getValue(1);
if (!ConstantMemory)
@@ -4628,7 +4659,7 @@ void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) {
auto Flags = MachineMemOperand::MOLoad;
if (I.isVolatile())
Flags |= MachineMemOperand::MOVolatile;
- if (I.getMetadata(LLVMContext::MD_invariant_load) != nullptr)
+ if (I.hasMetadata(LLVMContext::MD_invariant_load))
Flags |= MachineMemOperand::MOInvariant;
if (isDereferenceablePointer(I.getPointerOperand(), I.getType(),
DAG.getDataLayout()))
@@ -4645,9 +4676,27 @@ void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) {
AAMDNodes(), nullptr, SSID, Order);
InChain = TLI.prepareVolatileOrAtomicLoad(InChain, dl, DAG);
- SDValue L =
- DAG.getAtomic(ISD::ATOMIC_LOAD, dl, MemVT, MemVT, InChain,
- getValue(I.getPointerOperand()), MMO);
+
+ SDValue Ptr = getValue(I.getPointerOperand());
+
+ if (TLI.lowerAtomicLoadAsLoadSDNode(I)) {
+ // TODO: Once this is better exercised by tests, it should be merged with
+ // the normal path for loads to prevent future divergence.
+ SDValue L = DAG.getLoad(MemVT, dl, InChain, Ptr, MMO);
+ if (MemVT != VT)
+ L = DAG.getPtrExtOrTrunc(L, dl, VT);
+
+ setValue(&I, L);
+ SDValue OutChain = L.getValue(1);
+ if (!I.isUnordered())
+ DAG.setRoot(OutChain);
+ else
+ PendingLoads.push_back(OutChain);
+ return;
+ }
+
+ SDValue L = DAG.getAtomic(ISD::ATOMIC_LOAD, dl, MemVT, MemVT, InChain,
+ Ptr, MMO);
SDValue OutChain = L.getValue(1);
if (MemVT != VT)
@@ -4686,9 +4735,17 @@ void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) {
SDValue Val = getValue(I.getValueOperand());
if (Val.getValueType() != MemVT)
Val = DAG.getPtrExtOrTrunc(Val, dl, MemVT);
+ SDValue Ptr = getValue(I.getPointerOperand());
+ if (TLI.lowerAtomicStoreAsStoreSDNode(I)) {
+ // TODO: Once this is better exercised by tests, it should be merged with
+ // the normal path for stores to prevent future divergence.
+ SDValue S = DAG.getStore(InChain, dl, Val, Ptr, MMO);
+ DAG.setRoot(S);
+ return;
+ }
SDValue OutChain = DAG.getAtomic(ISD::ATOMIC_STORE, dl, MemVT, InChain,
- getValue(I.getPointerOperand()), Val, MMO);
+ Ptr, Val, MMO);
DAG.setRoot(OutChain);
@@ -4731,8 +4788,22 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
// Add all operands of the call to the operand list.
for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
- SDValue Op = getValue(I.getArgOperand(i));
- Ops.push_back(Op);
+ const Value *Arg = I.getArgOperand(i);
+ if (!I.paramHasAttr(i, Attribute::ImmArg)) {
+ Ops.push_back(getValue(Arg));
+ continue;
+ }
+
+ // Use TargetConstant instead of a regular constant for immarg.
+ EVT VT = TLI.getValueType(*DL, Arg->getType(), true);
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(Arg)) {
+ assert(CI->getBitWidth() <= 64 &&
+ "large intrinsic immediates not handled");
+ Ops.push_back(DAG.getTargetConstant(*CI, SDLoc(), VT));
+ } else {
+ Ops.push_back(
+ DAG.getTargetConstantFP(*cast<ConstantFP>(Arg), SDLoc(), VT));
+ }
}
SmallVector<EVT, 4> ValueVTs;
@@ -4749,10 +4820,10 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
// This is target intrinsic that touches memory
AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);
- Result =
- DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(), VTs, Ops, Info.memVT,
- MachinePointerInfo(Info.ptrVal, Info.offset),
- Info.align, Info.flags, Info.size, AAInfo);
+ Result = DAG.getMemIntrinsicNode(
+ Info.opc, getCurSDLoc(), VTs, Ops, Info.memVT,
+ MachinePointerInfo(Info.ptrVal, Info.offset),
+ Info.align ? Info.align->value() : 0, Info.flags, Info.size, AAInfo);
} else if (!HasChain) {
Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops);
} else if (!I.getType()->isVoidTy()) {
@@ -4918,12 +4989,11 @@ static SDValue expandExp(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
// Put the exponent in the right bit position for later addition to the
// final result:
//
- // #define LOG2OFe 1.4426950f
- // t0 = Op * LOG2OFe
+ // t0 = Op * log2(e)
// TODO: What fast-math-flags should be set here?
SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, Op,
- getF32Constant(DAG, 0x3fb8aa3b, dl));
+ DAG.getConstantFP(numbers::log2ef, dl, MVT::f32));
return getLimitedPrecisionExp2(t0, dl, DAG);
}
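
The hex literal 0x3fb8aa3b being replaced was just log2(e) as an f32; a quick numeric check of the identity the expansion relies on, exp(x) = 2^(x * log2(e)):

    #include <cassert>
    #include <cmath>

    int main() {
      const double Log2e = 1.4426950408889634; // value of numbers::log2e
      double X = 1.5;
      assert(std::fabs(std::exp(X) - std::exp2(X * Log2e)) < 1e-12);
    }
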
@@ -4941,10 +5011,11 @@ static SDValue expandLog(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
- // Scale the exponent by log(2) [0.69314718f].
+ // Scale the exponent by log(2).
SDValue Exp = GetExponent(DAG, Op1, TLI, dl);
- SDValue LogOfExponent = DAG.getNode(ISD::FMUL, dl, MVT::f32, Exp,
- getF32Constant(DAG, 0x3f317218, dl));
+ SDValue LogOfExponent =
+ DAG.getNode(ISD::FMUL, dl, MVT::f32, Exp,
+ DAG.getConstantFP(numbers::ln2f, dl, MVT::f32));
// Get the significand and build it into a floating-point number with
// exponent of 1.
@@ -5311,19 +5382,32 @@ static SDValue ExpandPowI(const SDLoc &DL, SDValue LHS, SDValue RHS,
return DAG.getNode(ISD::FPOWI, DL, LHS.getValueType(), LHS, RHS);
}
-// getUnderlyingArgReg - Find underlying register used for a truncated or
-// bitcasted argument.
-static unsigned getUnderlyingArgReg(const SDValue &N) {
+// getUnderlyingArgRegs - Find underlying registers used for a truncated,
+// bitcasted, or split argument. Returns a list of <Register, size in bits>
+static void
+getUnderlyingArgRegs(SmallVectorImpl<std::pair<unsigned, unsigned>> &Regs,
+ const SDValue &N) {
switch (N.getOpcode()) {
- case ISD::CopyFromReg:
- return cast<RegisterSDNode>(N.getOperand(1))->getReg();
+ case ISD::CopyFromReg: {
+ SDValue Op = N.getOperand(1);
+ Regs.emplace_back(cast<RegisterSDNode>(Op)->getReg(),
+ Op.getValueType().getSizeInBits());
+ return;
+ }
case ISD::BITCAST:
case ISD::AssertZext:
case ISD::AssertSext:
case ISD::TRUNCATE:
- return getUnderlyingArgReg(N.getOperand(0));
+ getUnderlyingArgRegs(Regs, N.getOperand(0));
+ return;
+ case ISD::BUILD_PAIR:
+ case ISD::BUILD_VECTOR:
+ case ISD::CONCAT_VECTORS:
+ for (SDValue Op : N->op_values())
+ getUnderlyingArgRegs(Regs, Op);
+ return;
default:
- return 0;
+ return;
}
}
@@ -5412,11 +5496,16 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
if (FI != std::numeric_limits<int>::max())
Op = MachineOperand::CreateFI(FI);
+ SmallVector<std::pair<unsigned, unsigned>, 8> ArgRegsAndSizes;
if (!Op && N.getNode()) {
- unsigned Reg = getUnderlyingArgReg(N);
- if (Reg && TargetRegisterInfo::isVirtualRegister(Reg)) {
+ getUnderlyingArgRegs(ArgRegsAndSizes, N);
+ Register Reg;
+ if (ArgRegsAndSizes.size() == 1)
+ Reg = ArgRegsAndSizes.front().first;
+
+ if (Reg && Reg.isVirtual()) {
MachineRegisterInfo &RegInfo = MF.getRegInfo();
- unsigned PR = RegInfo.getLiveInPhysReg(Reg);
+ Register PR = RegInfo.getLiveInPhysReg(Reg);
if (PR)
Reg = PR;
}
@@ -5436,29 +5525,42 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
}
if (!Op) {
+ // Create a DBG_VALUE for each decomposed value in ArgRegs to cover Reg
+ auto splitMultiRegDbgValue
+ = [&](ArrayRef<std::pair<unsigned, unsigned>> SplitRegs) {
+ unsigned Offset = 0;
+ for (auto RegAndSize : SplitRegs) {
+ auto FragmentExpr = DIExpression::createFragmentExpression(
+ Expr, Offset, RegAndSize.second);
+ if (!FragmentExpr)
+ continue;
+ assert(!IsDbgDeclare && "DbgDeclare operand is not in memory?");
+ FuncInfo.ArgDbgValues.push_back(
+ BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), false,
+ RegAndSize.first, Variable, *FragmentExpr));
+ Offset += RegAndSize.second;
+ }
+ };
+
// Check if ValueMap has reg number.
- DenseMap<const Value *, unsigned>::iterator VMI = FuncInfo.ValueMap.find(V);
+ DenseMap<const Value *, unsigned>::const_iterator
+ VMI = FuncInfo.ValueMap.find(V);
if (VMI != FuncInfo.ValueMap.end()) {
const auto &TLI = DAG.getTargetLoweringInfo();
RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), VMI->second,
V->getType(), getABIRegCopyCC(V));
if (RFV.occupiesMultipleRegs()) {
- unsigned Offset = 0;
- for (auto RegAndSize : RFV.getRegsAndSizes()) {
- Op = MachineOperand::CreateReg(RegAndSize.first, false);
- auto FragmentExpr = DIExpression::createFragmentExpression(
- Expr, Offset, RegAndSize.second);
- if (!FragmentExpr)
- continue;
- FuncInfo.ArgDbgValues.push_back(
- BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsDbgDeclare,
- Op->getReg(), Variable, *FragmentExpr));
- Offset += RegAndSize.second;
- }
+ splitMultiRegDbgValue(RFV.getRegsAndSizes());
return true;
}
+
Op = MachineOperand::CreateReg(VMI->second, false);
IsIndirect = IsDbgDeclare;
+ } else if (ArgRegsAndSizes.size() > 1) {
+ // This was split due to the calling convention, and no virtual register
+ // mapping exists for the value.
+ splitMultiRegDbgValue(ArgRegsAndSizes);
+ return true;
}
}
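
The refactored splitMultiRegDbgValue walks <register, size> pairs and gives each a DIExpression fragment at an accumulating bit offset; a toy model of the offset bookkeeping (register numbers are hypothetical):

    #include <cassert>
    #include <utility>
    #include <vector>

    int main() {
      std::vector<std::pair<unsigned, unsigned>> RegsAndSizes = {
          {1, 32}, {2, 32}, {3, 64}};       // <Reg, size in bits>
      unsigned Offset = 0;
      for (auto &RS : RegsAndSizes) {
        // A fragment expression would cover [Offset, Offset + RS.second).
        Offset += RS.second;
      }
      assert(Offset == 128 && "fragments tile the whole 128-bit value");
    }
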
@@ -5468,8 +5570,10 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
assert(Variable->isValidLocationForIntrinsic(DL) &&
"Expected inlined-at fields to agree");
IsIndirect = (Op->isReg()) ? IsIndirect : true;
+ if (IsIndirect)
+ Expr = DIExpression::append(Expr, {dwarf::DW_OP_deref});
FuncInfo.ArgDbgValues.push_back(
- BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsIndirect,
+ BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), false,
*Op, Variable, Expr));
return true;
@@ -5554,11 +5658,11 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
return;
case Intrinsic::sponentry:
setValue(&I, DAG.getNode(ISD::SPONENTRY, sdl,
- TLI.getPointerTy(DAG.getDataLayout())));
+ TLI.getFrameIndexTy(DAG.getDataLayout())));
return;
case Intrinsic::frameaddress:
setValue(&I, DAG.getNode(ISD::FRAMEADDR, sdl,
- TLI.getPointerTy(DAG.getDataLayout()),
+ TLI.getFrameIndexTy(DAG.getDataLayout()),
getValue(I.getArgOperand(0))));
return;
case Intrinsic::read_register: {
@@ -5888,65 +5992,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
case Intrinsic::masked_compressstore:
visitMaskedStore(I, true /* IsCompressing */);
return;
- case Intrinsic::x86_mmx_pslli_w:
- case Intrinsic::x86_mmx_pslli_d:
- case Intrinsic::x86_mmx_pslli_q:
- case Intrinsic::x86_mmx_psrli_w:
- case Intrinsic::x86_mmx_psrli_d:
- case Intrinsic::x86_mmx_psrli_q:
- case Intrinsic::x86_mmx_psrai_w:
- case Intrinsic::x86_mmx_psrai_d: {
- SDValue ShAmt = getValue(I.getArgOperand(1));
- if (isa<ConstantSDNode>(ShAmt)) {
- visitTargetIntrinsic(I, Intrinsic);
- return;
- }
- unsigned NewIntrinsic = 0;
- EVT ShAmtVT = MVT::v2i32;
- switch (Intrinsic) {
- case Intrinsic::x86_mmx_pslli_w:
- NewIntrinsic = Intrinsic::x86_mmx_psll_w;
- break;
- case Intrinsic::x86_mmx_pslli_d:
- NewIntrinsic = Intrinsic::x86_mmx_psll_d;
- break;
- case Intrinsic::x86_mmx_pslli_q:
- NewIntrinsic = Intrinsic::x86_mmx_psll_q;
- break;
- case Intrinsic::x86_mmx_psrli_w:
- NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
- break;
- case Intrinsic::x86_mmx_psrli_d:
- NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
- break;
- case Intrinsic::x86_mmx_psrli_q:
- NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
- break;
- case Intrinsic::x86_mmx_psrai_w:
- NewIntrinsic = Intrinsic::x86_mmx_psra_w;
- break;
- case Intrinsic::x86_mmx_psrai_d:
- NewIntrinsic = Intrinsic::x86_mmx_psra_d;
- break;
- default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
- }
-
- // The vector shift intrinsics with scalars uses 32b shift amounts but
- // the sse2/mmx shift instructions reads 64 bits. Set the upper 32 bits
- // to be zero.
- // We must do this early because v2i32 is not a legal type.
- SDValue ShOps[2];
- ShOps[0] = ShAmt;
- ShOps[1] = DAG.getConstant(0, sdl, MVT::i32);
- ShAmt = DAG.getBuildVector(ShAmtVT, sdl, ShOps);
- EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
- ShAmt = DAG.getNode(ISD::BITCAST, sdl, DestVT, ShAmt);
- Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, sdl, DestVT,
- DAG.getConstant(NewIntrinsic, sdl, MVT::i32),
- getValue(I.getArgOperand(0)), ShAmt);
- setValue(&I, Res);
- return;
- }
case Intrinsic::powi:
setValue(&I, ExpandPowI(sdl, getValue(I.getArgOperand(0)),
getValue(I.getArgOperand(1)), DAG));
@@ -6063,6 +6108,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
case Intrinsic::experimental_constrained_fdiv:
case Intrinsic::experimental_constrained_frem:
case Intrinsic::experimental_constrained_fma:
+ case Intrinsic::experimental_constrained_fptosi:
+ case Intrinsic::experimental_constrained_fptoui:
case Intrinsic::experimental_constrained_fptrunc:
case Intrinsic::experimental_constrained_fpext:
case Intrinsic::experimental_constrained_sqrt:
@@ -6075,12 +6122,16 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
case Intrinsic::experimental_constrained_log:
case Intrinsic::experimental_constrained_log10:
case Intrinsic::experimental_constrained_log2:
+ case Intrinsic::experimental_constrained_lrint:
+ case Intrinsic::experimental_constrained_llrint:
case Intrinsic::experimental_constrained_rint:
case Intrinsic::experimental_constrained_nearbyint:
case Intrinsic::experimental_constrained_maxnum:
case Intrinsic::experimental_constrained_minnum:
case Intrinsic::experimental_constrained_ceil:
case Intrinsic::experimental_constrained_floor:
+ case Intrinsic::experimental_constrained_lround:
+ case Intrinsic::experimental_constrained_llround:
case Intrinsic::experimental_constrained_round:
case Intrinsic::experimental_constrained_trunc:
visitConstrainedFPIntrinsic(cast<ConstrainedFPIntrinsic>(I));
@@ -6272,6 +6323,14 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
Op3));
return;
}
+ case Intrinsic::umul_fix_sat: {
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2 = getValue(I.getArgOperand(1));
+ SDValue Op3 = getValue(I.getArgOperand(2));
+ setValue(&I, DAG.getNode(ISD::UMULFIXSAT, sdl, Op1.getValueType(), Op1, Op2,
+ Op3));
+ return;
+ }
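
The new umul.fix.sat wiring maps straight onto ISD::UMULFIXSAT; its toy semantics, an unsigned fixed-point multiply with Scale fractional bits that saturates on overflow (hand-rolled sketch, not the DAG semantics source):

    #include <cassert>
    #include <cstdint>

    uint32_t umulFixSat(uint32_t A, uint32_t B, unsigned Scale) {
      uint64_t P = ((uint64_t)A * B) >> Scale;
      return P > UINT32_MAX ? UINT32_MAX : (uint32_t)P;
    }

    int main() {
      assert(umulFixSat(0x18000, 0x20000, 16) == 0x30000); // 1.5 * 2.0 = 3.0
      assert(umulFixSat(0xFFFFFFFFu, 0xFFFFFFFFu, 0) == 0xFFFFFFFFu); // sat
    }
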
case Intrinsic::stacksave: {
SDValue Op = getRoot();
Res = DAG.getNode(
@@ -6347,29 +6406,11 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
DAG.setRoot(Res);
return;
}
- case Intrinsic::objectsize: {
- // If we don't know by now, we're never going to know.
- ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(1));
-
- assert(CI && "Non-constant type in __builtin_object_size?");
-
- SDValue Arg = getValue(I.getCalledValue());
- EVT Ty = Arg.getValueType();
-
- if (CI->isZero())
- Res = DAG.getConstant(-1ULL, sdl, Ty);
- else
- Res = DAG.getConstant(0, sdl, Ty);
-
- setValue(&I, Res);
- return;
- }
+ case Intrinsic::objectsize:
+ llvm_unreachable("llvm.objectsize.* should have been lowered already");
case Intrinsic::is_constant:
- // If this wasn't constant-folded away by now, then it's not a
- // constant.
- setValue(&I, DAG.getConstant(0, sdl, MVT::i1));
- return;
+ llvm_unreachable("llvm.is.constant.* should have been lowered already");
case Intrinsic::annotation:
case Intrinsic::ptr_annotation:
@@ -6818,6 +6859,17 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
setValue(&I, Val);
return;
}
+ case Intrinsic::ptrmask: {
+ SDValue Ptr = getValue(I.getOperand(0));
+ SDValue Const = getValue(I.getOperand(1));
+
+ EVT DestVT =
+ EVT(DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()));
+
+ setValue(&I, DAG.getNode(ISD::AND, getCurSDLoc(), DestVT, Ptr,
+ DAG.getZExtOrTrunc(Const, getCurSDLoc(), DestVT)));
+ return;
+ }
}
}
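
The llvm.ptrmask case above lowers to a plain AND of the pointer with the zero-extended or truncated mask; a standalone sketch of the arithmetic:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t Ptr = 0x1003;           // hypothetical address
      uint64_t Mask = ~uint64_t(15);   // e.g. align down to 16 bytes
      assert((Ptr & Mask) == 0x1000);
    }
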
@@ -6845,6 +6897,12 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
case Intrinsic::experimental_constrained_fma:
Opcode = ISD::STRICT_FMA;
break;
+ case Intrinsic::experimental_constrained_fptosi:
+ Opcode = ISD::STRICT_FP_TO_SINT;
+ break;
+ case Intrinsic::experimental_constrained_fptoui:
+ Opcode = ISD::STRICT_FP_TO_UINT;
+ break;
case Intrinsic::experimental_constrained_fptrunc:
Opcode = ISD::STRICT_FP_ROUND;
break;
@@ -6881,6 +6939,12 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
case Intrinsic::experimental_constrained_log2:
Opcode = ISD::STRICT_FLOG2;
break;
+ case Intrinsic::experimental_constrained_lrint:
+ Opcode = ISD::STRICT_LRINT;
+ break;
+ case Intrinsic::experimental_constrained_llrint:
+ Opcode = ISD::STRICT_LLRINT;
+ break;
case Intrinsic::experimental_constrained_rint:
Opcode = ISD::STRICT_FRINT;
break;
@@ -6899,6 +6963,12 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
case Intrinsic::experimental_constrained_floor:
Opcode = ISD::STRICT_FFLOOR;
break;
+ case Intrinsic::experimental_constrained_lround:
+ Opcode = ISD::STRICT_LROUND;
+ break;
+ case Intrinsic::experimental_constrained_llround:
+ Opcode = ISD::STRICT_LLROUND;
+ break;
case Intrinsic::experimental_constrained_round:
Opcode = ISD::STRICT_FROUND;
break;
@@ -7102,7 +7172,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
if (SwiftErrorVal && TLI.supportSwiftError()) {
// Get the last element of InVals.
SDValue Src = CLI.InVals.back();
- unsigned VReg = SwiftError.getOrCreateVRegDefAt(
+ Register VReg = SwiftError.getOrCreateVRegDefAt(
CS.getInstruction(), FuncInfo.MBB, SwiftErrorVal);
SDValue CopyNode = CLI.DAG.getCopyToReg(Result.second, CLI.DL, VReg, Src);
DAG.setRoot(CopyNode);
@@ -8021,6 +8091,14 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
// Compute the constraint code and ConstraintType to use.
TLI.ComputeConstraintToUse(T, SDValue());
+ if (T.ConstraintType == TargetLowering::C_Immediate &&
+ OpInfo.CallOperand && !isa<ConstantSDNode>(OpInfo.CallOperand))
+ // We've delayed emitting a diagnostic like the "n" constraint because
+ // inlining could cause an integer showing up.
+ return emitInlineAsmError(
+ CS, "constraint '" + Twine(T.ConstraintCode) + "' expects an "
+ "integer constant expression");
+
ExtraInfo.update(T);
}
@@ -8105,7 +8183,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
switch (OpInfo.Type) {
case InlineAsm::isOutput:
if (OpInfo.ConstraintType == TargetLowering::C_Memory ||
- (OpInfo.ConstraintType == TargetLowering::C_Other &&
+ ((OpInfo.ConstraintType == TargetLowering::C_Immediate ||
+ OpInfo.ConstraintType == TargetLowering::C_Other) &&
OpInfo.isIndirect)) {
unsigned ConstraintID =
TLI.getInlineAsmMemConstraint(OpInfo.ConstraintCode);
@@ -8119,13 +8198,14 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
MVT::i32));
AsmNodeOperands.push_back(OpInfo.CallOperand);
break;
- } else if ((OpInfo.ConstraintType == TargetLowering::C_Other &&
+ } else if (((OpInfo.ConstraintType == TargetLowering::C_Immediate ||
+ OpInfo.ConstraintType == TargetLowering::C_Other) &&
!OpInfo.isIndirect) ||
OpInfo.ConstraintType == TargetLowering::C_Register ||
OpInfo.ConstraintType == TargetLowering::C_RegisterClass) {
// Otherwise, this outputs to a register (directly for C_Register /
- // C_RegisterClass, and a target-defined fashion for C_Other). Find a
- // register that we can use.
+ // C_RegisterClass, and a target-defined fashion for
+ // C_Immediate/C_Other). Find a register that we can use.
if (OpInfo.AssignedRegs.Regs.empty()) {
emitInlineAsmError(
CS, "couldn't allocate output register for constraint '" +
@@ -8205,15 +8285,24 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
}
// Treat indirect 'X' constraint as memory.
- if (OpInfo.ConstraintType == TargetLowering::C_Other &&
+ if ((OpInfo.ConstraintType == TargetLowering::C_Immediate ||
+ OpInfo.ConstraintType == TargetLowering::C_Other) &&
OpInfo.isIndirect)
OpInfo.ConstraintType = TargetLowering::C_Memory;
- if (OpInfo.ConstraintType == TargetLowering::C_Other) {
+ if (OpInfo.ConstraintType == TargetLowering::C_Immediate ||
+ OpInfo.ConstraintType == TargetLowering::C_Other) {
std::vector<SDValue> Ops;
TLI.LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode,
Ops, DAG);
if (Ops.empty()) {
+ if (OpInfo.ConstraintType == TargetLowering::C_Immediate)
+ if (isa<ConstantSDNode>(InOperandVal)) {
+ emitInlineAsmError(CS, "value out of range for constraint '" +
+ Twine(OpInfo.ConstraintCode) + "'");
+ return;
+ }
+
emitInlineAsmError(CS, "invalid operand for inline asm constraint '" +
Twine(OpInfo.ConstraintCode) + "'");
return;
@@ -8250,7 +8339,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
}
assert((OpInfo.ConstraintType == TargetLowering::C_RegisterClass ||
- OpInfo.ConstraintType == TargetLowering::C_Register) &&
+ OpInfo.ConstraintType == TargetLowering::C_Register ||
+ OpInfo.ConstraintType == TargetLowering::C_Immediate) &&
"Unknown constraint type!");
// TODO: Support this.
@@ -8356,6 +8446,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
Val = OpInfo.AssignedRegs.getCopyFromRegs(
DAG, FuncInfo, getCurSDLoc(), Chain, &Flag, CS.getInstruction());
break;
+ case TargetLowering::C_Immediate:
case TargetLowering::C_Other:
Val = TLI.LowerAsmOutputForConstraint(Chain, Flag, getCurSDLoc(),
OpInfo, DAG);
@@ -9018,7 +9109,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
// Certain targets (such as MIPS), may have a different ABI alignment
// for a type depending on the context. Give the target a chance to
// specify the alignment it wants.
- unsigned OriginalAlignment = getABIAlignmentForCallingConv(ArgTy, DL);
+ const Align OriginalAlignment(getABIAlignmentForCallingConv(ArgTy, DL));
if (Args[i].Ty->isPointerTy()) {
Flags.setPointer();
@@ -9073,7 +9164,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
FrameAlign = Args[i].Alignment;
else
FrameAlign = getByValTypeAlignment(ElementTy, DL);
- Flags.setByValAlign(FrameAlign);
+ Flags.setByValAlign(Align(FrameAlign));
}
if (Args[i].IsNest)
Flags.setNest();
@@ -9129,7 +9220,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
if (NumParts > 1 && j == 0)
MyFlags.Flags.setSplit();
else if (j != 0) {
- MyFlags.Flags.setOrigAlign(1);
+ MyFlags.Flags.setOrigAlign(Align::None());
if (j == NumParts - 1)
MyFlags.Flags.setSplitEnd();
}
@@ -9259,7 +9350,7 @@ SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) {
assert((Op.getOpcode() != ISD::CopyFromReg ||
cast<RegisterSDNode>(Op.getOperand(1))->getReg() != Reg) &&
"Copy from a reg to the same reg!");
- assert(!TargetRegisterInfo::isPhysicalRegister(Reg) && "Is a physreg");
+ assert(!Register::isPhysicalRegister(Reg) && "Is a physreg");
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// If this is an InlineAsm we have to match the registers required, not the
@@ -9516,8 +9607,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
// Certain targets (such as MIPS), may have a different ABI alignment
// for a type depending on the context. Give the target a chance to
// specify the alignment it wants.
- unsigned OriginalAlignment =
- TLI->getABIAlignmentForCallingConv(ArgTy, DL);
+ const Align OriginalAlignment(
+ TLI->getABIAlignmentForCallingConv(ArgTy, DL));
if (Arg.getType()->isPointerTy()) {
Flags.setPointer();
@@ -9577,7 +9668,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
FrameAlign = Arg.getParamAlignment();
else
FrameAlign = TLI->getByValTypeAlignment(ElementTy, DL);
- Flags.setByValAlign(FrameAlign);
+ Flags.setByValAlign(Align(FrameAlign));
}
if (Arg.hasAttribute(Attribute::Nest))
Flags.setNest();
@@ -9586,6 +9677,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
Flags.setOrigAlign(OriginalAlignment);
if (ArgCopyElisionCandidates.count(&Arg))
Flags.setCopyElisionCandidate();
+ if (Arg.hasAttribute(Attribute::Returned))
+ Flags.setReturned();
MVT RegisterVT = TLI->getRegisterTypeForCallingConv(
*CurDAG->getContext(), F.getCallingConv(), VT);
@@ -9598,7 +9691,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
MyFlags.Flags.setSplit();
// If it isn't the first piece, the alignment must be 1
else if (i > 0) {
- MyFlags.Flags.setOrigAlign(1);
+ MyFlags.Flags.setOrigAlign(Align::None());
if (i == NumRegs - 1)
MyFlags.Flags.setSplitEnd();
}
@@ -9650,7 +9743,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
MachineFunction& MF = SDB->DAG.getMachineFunction();
MachineRegisterInfo& RegInfo = MF.getRegInfo();
- unsigned SRetReg = RegInfo.createVirtualRegister(TLI->getRegClassFor(RegVT));
+ Register SRetReg =
+ RegInfo.createVirtualRegister(TLI->getRegClassFor(RegVT));
FuncInfo->DemoteRegister = SRetReg;
NewRoot =
SDB->DAG.getCopyToReg(NewRoot, SDB->getCurSDLoc(), SRetReg, ArgValue);
@@ -9748,10 +9842,14 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());
}
+ // Analyses past this point are naive and don't expect an assertion.
+ if (Res.getOpcode() == ISD::AssertZext)
+ Res = Res.getOperand(0);
+
// Update the SwiftErrorVRegDefMap.
if (Res.getOpcode() == ISD::CopyFromReg && isSwiftErrorArg) {
unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
SwiftError->setCurrentVReg(FuncInfo->MBB, SwiftError->getFunctionArg(),
Reg);
}
@@ -9763,7 +9861,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
// FIXME: This isn't very clean... it would be nice to make this more
// general.
unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
FuncInfo->ValueMap[&Arg] = Reg;
continue;
}
@@ -10087,8 +10185,6 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond,
break;
}
case CC_BitTests: {
- // FIXME: If Fallthrough is unreachable, skip the range check.
-
// FIXME: Optimize away range check based on pivot comparisons.
BitTestBlock *BTB = &SL->BitTestCases[I->BTCasesIndex];
@@ -10109,6 +10205,11 @@ void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond,
BTB->DefaultProb -= DefaultProb / 2;
}
+ if (FallthroughUnreachable) {
+ // Skip the range check if the fallthrough block is unreachable.
+ BTB->OmitRangeCheck = true;
+ }
+
// If we're in the right place, emit the bit test header right now.
if (CurMBB == SwitchMBB) {
visitBitTestHeader(*BTB, SwitchMBB);
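The OmitRangeCheck path above is easiest to see on a concrete switch. A minimal standalone sketch of the idea (plain C++, not part of the patch; the function and mask are invented for illustration): when the default is unreachable, the scrutinee is already known to be in range, so the bounds check that normally guards the bit-test header is dead weight.

#include <cassert>
#include <cstdint>

// Hand-lowered form of a switch over {1, 3, 5} whose default branch is
// unreachable: x is known to be <= 5, so the bit test needs no range check.
static bool isOneOf135(uint32_t x) {
  const uint32_t Mask = (1u << 1) | (1u << 3) | (1u << 5);
  return (Mask >> x) & 1u; // no 'if (x > 5)' guard needed
}

int main() {
  assert(isOneOf135(3) && !isOneOf135(4));
  return 0;
}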
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 0072e33f23b7..bfcf30b430b6 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -426,7 +426,7 @@ public:
SelectionDAGBuilder(SelectionDAG &dag, FunctionLoweringInfo &funcinfo,
SwiftErrorValueTracking &swifterror, CodeGenOpt::Level ol)
: SDNodeOrder(LowestSDNodeOrder), TM(dag.getTarget()), DAG(dag),
- SL(make_unique<SDAGSwitchLowering>(this, funcinfo)), FuncInfo(funcinfo),
+ SL(std::make_unique<SDAGSwitchLowering>(this, funcinfo)), FuncInfo(funcinfo),
SwiftError(swifterror) {}
void init(GCFunctionInfo *gfi, AliasAnalysis *AA,
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index da3049881d31..bc10f7621239 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -280,6 +280,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::EXTRACT_SUBVECTOR: return "extract_subvector";
case ISD::SCALAR_TO_VECTOR: return "scalar_to_vector";
case ISD::VECTOR_SHUFFLE: return "vector_shuffle";
+ case ISD::SPLAT_VECTOR: return "splat_vector";
case ISD::CARRY_FALSE: return "carry_false";
case ISD::ADDC: return "addc";
case ISD::ADDE: return "adde";
@@ -305,6 +306,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::SMULFIX: return "smulfix";
case ISD::SMULFIXSAT: return "smulfixsat";
case ISD::UMULFIX: return "umulfix";
+ case ISD::UMULFIXSAT: return "umulfixsat";
// Conversion operators.
case ISD::SIGN_EXTEND: return "sign_extend";
@@ -318,22 +320,27 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::FP_ROUND: return "fp_round";
case ISD::STRICT_FP_ROUND: return "strict_fp_round";
case ISD::FLT_ROUNDS_: return "flt_rounds";
- case ISD::FP_ROUND_INREG: return "fp_round_inreg";
case ISD::FP_EXTEND: return "fp_extend";
case ISD::STRICT_FP_EXTEND: return "strict_fp_extend";
case ISD::SINT_TO_FP: return "sint_to_fp";
case ISD::UINT_TO_FP: return "uint_to_fp";
case ISD::FP_TO_SINT: return "fp_to_sint";
+ case ISD::STRICT_FP_TO_SINT: return "strict_fp_to_sint";
case ISD::FP_TO_UINT: return "fp_to_uint";
+ case ISD::STRICT_FP_TO_UINT: return "strict_fp_to_uint";
case ISD::BITCAST: return "bitcast";
case ISD::ADDRSPACECAST: return "addrspacecast";
case ISD::FP16_TO_FP: return "fp16_to_fp";
case ISD::FP_TO_FP16: return "fp_to_fp16";
case ISD::LROUND: return "lround";
+ case ISD::STRICT_LROUND: return "strict_lround";
case ISD::LLROUND: return "llround";
+ case ISD::STRICT_LLROUND: return "strict_llround";
case ISD::LRINT: return "lrint";
+ case ISD::STRICT_LRINT: return "strict_lrint";
case ISD::LLRINT: return "llrint";
+ case ISD::STRICT_LLRINT: return "strict_llrint";
// Control flow instructions
case ISD::BR: return "br";
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index bdf9f2c166e1..1f07a241a824 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -27,6 +27,7 @@
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -434,9 +435,9 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
TII = MF->getSubtarget().getInstrInfo();
TLI = MF->getSubtarget().getTargetLowering();
RegInfo = &MF->getRegInfo();
- LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(Fn);
GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : nullptr;
- ORE = make_unique<OptimizationRemarkEmitter>(&Fn);
+ ORE = std::make_unique<OptimizationRemarkEmitter>(&Fn);
auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
@@ -524,8 +525,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
To = J->second;
}
// Make sure the new register has a sufficiently constrained register class.
- if (TargetRegisterInfo::isVirtualRegister(From) &&
- TargetRegisterInfo::isVirtualRegister(To))
+ if (Register::isVirtualRegister(From) && Register::isVirtualRegister(To))
MRI.constrainRegClass(To, MRI.getRegClass(From));
// Replace it.
@@ -572,7 +572,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
bool hasFI = MI->getOperand(0).isFI();
Register Reg =
hasFI ? TRI.getFrameRegister(*MF) : MI->getOperand(0).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
EntryMBB->insert(EntryMBB->begin(), MI);
else {
MachineInstr *Def = RegInfo->getVRegDef(Reg);
@@ -582,7 +582,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
Def->getParent()->insert(std::next(InsertPos), MI);
} else
LLVM_DEBUG(dbgs() << "Dropping debug info for dead vreg"
- << TargetRegisterInfo::virtReg2Index(Reg) << "\n");
+ << Register::virtReg2Index(Reg) << "\n");
}
// If Reg is live-in then update debug info to track its copy in a vreg.
@@ -671,8 +671,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
To = J->second;
}
// Make sure the new register has a sufficiently constrained register class.
- if (TargetRegisterInfo::isVirtualRegister(From) &&
- TargetRegisterInfo::isVirtualRegister(To))
+ if (Register::isVirtualRegister(From) && Register::isVirtualRegister(To))
MRI.constrainRegClass(To, MRI.getRegClass(From));
// Replace it.
@@ -760,7 +759,7 @@ void SelectionDAGISel::ComputeLiveOutVRegInfo() {
continue;
unsigned DestReg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
- if (!TargetRegisterInfo::isVirtualRegister(DestReg))
+ if (!Register::isVirtualRegister(DestReg))
continue;
// Ignore non-integer values.
@@ -1652,9 +1651,8 @@ static bool MIIsInTerminatorSequence(const MachineInstr &MI) {
// Make sure that the copy dest is not a vreg when the copy source is a
// physical register.
- if (!OPI2->isReg() ||
- (!TargetRegisterInfo::isPhysicalRegister(OPI->getReg()) &&
- TargetRegisterInfo::isPhysicalRegister(OPI2->getReg())))
+ if (!OPI2->isReg() || (!Register::isPhysicalRegister(OPI->getReg()) &&
+ Register::isPhysicalRegister(OPI2->getReg())))
return false;
return true;
@@ -2234,9 +2232,9 @@ void SelectionDAGISel::Select_READ_REGISTER(SDNode *Op) {
SDLoc dl(Op);
MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(Op->getOperand(1));
const MDString *RegStr = dyn_cast<MDString>(MD->getMD()->getOperand(0));
- unsigned Reg =
+ Register Reg =
TLI->getRegisterByName(RegStr->getString().data(), Op->getValueType(0),
- *CurDAG);
+ CurDAG->getMachineFunction());
SDValue New = CurDAG->getCopyFromReg(
Op->getOperand(0), dl, Reg, Op->getValueType(0));
New->setNodeId(-1);
@@ -2248,9 +2246,9 @@ void SelectionDAGISel::Select_WRITE_REGISTER(SDNode *Op) {
SDLoc dl(Op);
MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(Op->getOperand(1));
const MDString *RegStr = dyn_cast<MDString>(MD->getMD()->getOperand(0));
- unsigned Reg = TLI->getRegisterByName(RegStr->getString().data(),
+ Register Reg = TLI->getRegisterByName(RegStr->getString().data(),
Op->getOperand(2).getValueType(),
- *CurDAG);
+ CurDAG->getMachineFunction());
SDValue New = CurDAG->getCopyToReg(
Op->getOperand(0), dl, Reg, Op->getOperand(2));
New->setNodeId(-1);
@@ -3323,10 +3321,13 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
continue;
}
- case OPC_EmitCopyToReg: {
+ case OPC_EmitCopyToReg:
+ case OPC_EmitCopyToReg2: {
unsigned RecNo = MatcherTable[MatcherIndex++];
assert(RecNo < RecordedNodes.size() && "Invalid EmitCopyToReg");
unsigned DestPhysReg = MatcherTable[MatcherIndex++];
+ if (Opcode == OPC_EmitCopyToReg2)
+ DestPhysReg |= MatcherTable[MatcherIndex++] << 8;
if (!InputChain.getNode())
InputChain = CurDAG->getEntryNode();
diff --git a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index 395e9a8a4fc5..fad98b6f50dc 100644
--- a/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -378,7 +378,6 @@ spillIncomingStatepointValue(SDValue Incoming, SDValue Chain,
// We use TargetFrameIndex so that isel will not select it into LEA
Loc = Builder.DAG.getTargetFrameIndex(Index, Builder.getFrameIndexTy());
-#ifndef NDEBUG
// Right now we always allocate spill slots that are of the same
// size as the value we're about to spill (the size of spillee can
// vary since we spill vectors of pointers too). At some point we
@@ -387,12 +386,18 @@ spillIncomingStatepointValue(SDValue Incoming, SDValue Chain,
MachineFrameInfo &MFI = Builder.DAG.getMachineFunction().getFrameInfo();
assert((MFI.getObjectSize(Index) * 8) == Incoming.getValueSizeInBits() &&
"Bad spill: stack slot does not match!");
-#endif
+ // Note: Using the alignment of the spill slot (rather than the ABI or
+ // preferred alignment) is required for correctness when dealing with spill
+ // slots whose preferred alignment is larger than the frame alignment.
auto &MF = Builder.DAG.getMachineFunction();
auto PtrInfo = MachinePointerInfo::getFixedStack(MF, Index);
+ auto *StoreMMO =
+ MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
+ MFI.getObjectSize(Index),
+ MFI.getObjectAlignment(Index));
Chain = Builder.DAG.getStore(Chain, Builder.getCurSDLoc(), Incoming, Loc,
- PtrInfo);
+ StoreMMO);
MMO = getMachineMemOperand(MF, *cast<FrameIndexSDNode>(Loc));
@@ -1011,20 +1016,27 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) {
return;
}
- SDValue SpillSlot =
- DAG.getTargetFrameIndex(*DerivedPtrLocation, getFrameIndexTy());
+ unsigned Index = *DerivedPtrLocation;
+ SDValue SpillSlot = DAG.getTargetFrameIndex(Index, getFrameIndexTy());
// Note: We know all of these reloads are independent, but don't bother to
// exploit that chain-wise. DAGCombine will happily do so as needed, so
// doing it here would be a small compile-time win at most.
SDValue Chain = getRoot();
- SDValue SpillLoad =
- DAG.getLoad(DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
- Relocate.getType()),
- getCurSDLoc(), Chain, SpillSlot,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
- *DerivedPtrLocation));
+ auto &MF = DAG.getMachineFunction();
+ auto &MFI = MF.getFrameInfo();
+ auto PtrInfo = MachinePointerInfo::getFixedStack(MF, Index);
+ auto *LoadMMO =
+ MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
+ MFI.getObjectSize(Index),
+ MFI.getObjectAlignment(Index));
+
+ auto LoadVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ Relocate.getType());
+
+ SDValue SpillLoad = DAG.getLoad(LoadVT, getCurSDLoc(), Chain,
+ SpillSlot, LoadMMO);
DAG.setRoot(SpillLoad.getValue(1));
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index b260cd91d468..9ab1324533f1 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -11,7 +11,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/TargetLowering.h"
-#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -37,7 +36,7 @@ using namespace llvm;
/// NOTE: The TargetMachine owns TLOF.
TargetLowering::TargetLowering(const TargetMachine &tm)
- : TargetLoweringBase(tm) {}
+ : TargetLoweringBase(tm) {}
const char *TargetLowering::getTargetNodeName(unsigned Opcode) const {
return nullptr;
@@ -80,7 +79,7 @@ bool TargetLowering::parametersInCSRMatch(const MachineRegisterInfo &MRI,
const CCValAssign &ArgLoc = ArgLocs[I];
if (!ArgLoc.isRegLoc())
continue;
- unsigned Reg = ArgLoc.getLocReg();
+ Register Reg = ArgLoc.getLocReg();
// Only look at callee saved registers.
if (MachineOperand::clobbersPhysReg(CallerPreservedMask, Reg))
continue;
@@ -121,19 +120,25 @@ void TargetLoweringBase::ArgListEntry::setAttributes(const CallBase *Call,
/// result of type RetVT.
std::pair<SDValue, SDValue>
TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT,
- ArrayRef<SDValue> Ops, bool isSigned,
- const SDLoc &dl, bool doesNotReturn,
- bool isReturnValueUsed,
- bool isPostTypeLegalization) const {
+ ArrayRef<SDValue> Ops,
+ MakeLibCallOptions CallOptions,
+ const SDLoc &dl) const {
TargetLowering::ArgListTy Args;
Args.reserve(Ops.size());
TargetLowering::ArgListEntry Entry;
- for (SDValue Op : Ops) {
- Entry.Node = Op;
+ for (unsigned i = 0; i < Ops.size(); ++i) {
+ SDValue NewOp = Ops[i];
+ Entry.Node = NewOp;
Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
- Entry.IsSExt = shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned);
- Entry.IsZExt = !shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned);
+ Entry.IsSExt = shouldSignExtendTypeInLibCall(NewOp.getValueType(),
+ CallOptions.IsSExt);
+ Entry.IsZExt = !Entry.IsSExt;
+
+ if (CallOptions.IsSoften &&
+ !shouldExtendTypeInLibCall(CallOptions.OpsVTBeforeSoften[i])) {
+ Entry.IsSExt = Entry.IsZExt = false;
+ }
Args.push_back(Entry);
}
@@ -144,15 +149,22 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT,
Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
TargetLowering::CallLoweringInfo CLI(DAG);
- bool signExtend = shouldSignExtendTypeInLibCall(RetVT, isSigned);
+ bool signExtend = shouldSignExtendTypeInLibCall(RetVT, CallOptions.IsSExt);
+ bool zeroExtend = !signExtend;
+
+ if (CallOptions.IsSoften &&
+ !shouldExtendTypeInLibCall(CallOptions.RetVTBeforeSoften)) {
+ signExtend = zeroExtend = false;
+ }
+
CLI.setDebugLoc(dl)
.setChain(DAG.getEntryNode())
.setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
- .setNoReturn(doesNotReturn)
- .setDiscardResult(!isReturnValueUsed)
- .setIsPostTypeLegalization(isPostTypeLegalization)
+ .setNoReturn(CallOptions.DoesNotReturn)
+ .setDiscardResult(!CallOptions.IsReturnValueUsed)
+ .setIsPostTypeLegalization(CallOptions.IsPostTypeLegalization)
.setSExtResult(signExtend)
- .setZExtResult(!signExtend);
+ .setZExtResult(zeroExtend);
return LowerCallTo(CLI);
}
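On the caller side, the trailing booleans become a MakeLibCallOptions bag. A hedged sketch of the new call shape (fragment only, not compilable standalone; LC, Ops, VT, and dl stand in for whatever the caller already has, and setSExt is assumed to be the setter for the IsSExt field consumed above):

// Before: makeLibCall(DAG, LC, VT, Ops, /*isSigned=*/true, dl)
TargetLowering::MakeLibCallOptions CallOptions;
CallOptions.setSExt(true); // sign-extend operands/result where needed
std::pair<SDValue, SDValue> Res =
    TLI.makeLibCall(DAG, LC, VT, Ops, CallOptions, dl);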
@@ -263,7 +275,8 @@ TargetLowering::findOptimalMemOpLowering(std::vector<EVT> &MemOps,
void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
SDValue &NewLHS, SDValue &NewRHS,
ISD::CondCode &CCCode,
- const SDLoc &dl) const {
+ const SDLoc &dl, const SDValue OldLHS,
+ const SDValue OldRHS) const {
assert((VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128 || VT == MVT::ppcf128)
&& "Unsupported setcc type!");
@@ -365,8 +378,11 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
// Use the target-specific return value for comparison lib calls.
EVT RetVT = getCmpLibcallReturnType();
SDValue Ops[2] = {NewLHS, NewRHS};
- NewLHS = makeLibCall(DAG, LC1, RetVT, Ops, false /*sign irrelevant*/,
- dl).first;
+ TargetLowering::MakeLibCallOptions CallOptions;
+ EVT OpsVT[2] = { OldLHS.getValueType(),
+ OldRHS.getValueType() };
+ CallOptions.setTypeListBeforeSoften(OpsVT, RetVT, true);
+ NewLHS = makeLibCall(DAG, LC1, RetVT, Ops, CallOptions, dl).first;
NewRHS = DAG.getConstant(0, dl, RetVT);
CCCode = getCmpLibcallCC(LC1);
@@ -378,8 +394,7 @@ void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
ISD::SETCC, dl,
getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), RetVT),
NewLHS, NewRHS, DAG.getCondCode(CCCode));
- NewLHS = makeLibCall(DAG, LC2, RetVT, Ops, false/*sign irrelevant*/,
- dl).first;
+ NewLHS = makeLibCall(DAG, LC2, RetVT, Ops, CallOptions, dl).first;
NewLHS = DAG.getNode(
ISD::SETCC, dl,
getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), RetVT),
@@ -564,6 +579,170 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
AssumeSingleUse);
}
+// TODO: Can we merge SelectionDAG::GetDemandedBits into this?
+// TODO: Under what circumstances can we create nodes? Constant folding?
+SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
+ SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ SelectionDAG &DAG, unsigned Depth) const {
+ // Limit search depth.
+ if (Depth >= SelectionDAG::MaxRecursionDepth)
+ return SDValue();
+
+ // Ignore UNDEFs.
+ if (Op.isUndef())
+ return SDValue();
+
+ // Not demanding any bits/elts from Op.
+ if (DemandedBits == 0 || DemandedElts == 0)
+ return DAG.getUNDEF(Op.getValueType());
+
+ unsigned NumElts = DemandedElts.getBitWidth();
+ KnownBits LHSKnown, RHSKnown;
+ switch (Op.getOpcode()) {
+ case ISD::BITCAST: {
+ SDValue Src = peekThroughBitcasts(Op.getOperand(0));
+ EVT SrcVT = Src.getValueType();
+ EVT DstVT = Op.getValueType();
+ unsigned NumSrcEltBits = SrcVT.getScalarSizeInBits();
+ unsigned NumDstEltBits = DstVT.getScalarSizeInBits();
+
+ if (NumSrcEltBits == NumDstEltBits)
+ if (SDValue V = SimplifyMultipleUseDemandedBits(
+ Src, DemandedBits, DemandedElts, DAG, Depth + 1))
+ return DAG.getBitcast(DstVT, V);
+
+ // TODO - bigendian once we have test coverage.
+ if (SrcVT.isVector() && (NumDstEltBits % NumSrcEltBits) == 0 &&
+ DAG.getDataLayout().isLittleEndian()) {
+ unsigned Scale = NumDstEltBits / NumSrcEltBits;
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ APInt DemandedSrcBits = APInt::getNullValue(NumSrcEltBits);
+ APInt DemandedSrcElts = APInt::getNullValue(NumSrcElts);
+ for (unsigned i = 0; i != Scale; ++i) {
+ unsigned Offset = i * NumSrcEltBits;
+ APInt Sub = DemandedBits.extractBits(NumSrcEltBits, Offset);
+ if (!Sub.isNullValue()) {
+ DemandedSrcBits |= Sub;
+ for (unsigned j = 0; j != NumElts; ++j)
+ if (DemandedElts[j])
+ DemandedSrcElts.setBit((j * Scale) + i);
+ }
+ }
+
+ if (SDValue V = SimplifyMultipleUseDemandedBits(
+ Src, DemandedSrcBits, DemandedSrcElts, DAG, Depth + 1))
+ return DAG.getBitcast(DstVT, V);
+ }
+
+ // TODO - bigendian once we have test coverage.
+ if ((NumSrcEltBits % NumDstEltBits) == 0 &&
+ DAG.getDataLayout().isLittleEndian()) {
+ unsigned Scale = NumSrcEltBits / NumDstEltBits;
+ unsigned NumSrcElts = SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1;
+ APInt DemandedSrcBits = APInt::getNullValue(NumSrcEltBits);
+ APInt DemandedSrcElts = APInt::getNullValue(NumSrcElts);
+ for (unsigned i = 0; i != NumElts; ++i)
+ if (DemandedElts[i]) {
+ unsigned Offset = (i % Scale) * NumDstEltBits;
+ DemandedSrcBits.insertBits(DemandedBits, Offset);
+ DemandedSrcElts.setBit(i / Scale);
+ }
+
+ if (SDValue V = SimplifyMultipleUseDemandedBits(
+ Src, DemandedSrcBits, DemandedSrcElts, DAG, Depth + 1))
+ return DAG.getBitcast(DstVT, V);
+ }
+
+ break;
+ }
+ case ISD::AND: {
+ LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+
+ // If all of the demanded bits are known 1 on one side, return the other.
+ // These bits cannot contribute to the result of the 'and' in this
+ // context.
+ if (DemandedBits.isSubsetOf(LHSKnown.Zero | RHSKnown.One))
+ return Op.getOperand(0);
+ if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.One))
+ return Op.getOperand(1);
+ break;
+ }
+ case ISD::OR: {
+ LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+
+ // If all of the demanded bits are known zero on one side, return the
+ // other. These bits cannot contribute to the result of the 'or' in this
+ // context.
+ if (DemandedBits.isSubsetOf(LHSKnown.One | RHSKnown.Zero))
+ return Op.getOperand(0);
+ if (DemandedBits.isSubsetOf(RHSKnown.One | LHSKnown.Zero))
+ return Op.getOperand(1);
+ break;
+ }
+ case ISD::XOR: {
+ LHSKnown = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ RHSKnown = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
+
+ // If all of the demanded bits are known zero on one side, return the
+ // other.
+ if (DemandedBits.isSubsetOf(RHSKnown.Zero))
+ return Op.getOperand(0);
+ if (DemandedBits.isSubsetOf(LHSKnown.Zero))
+ return Op.getOperand(1);
+ break;
+ }
+ case ISD::SIGN_EXTEND_INREG: {
+ // If none of the extended bits are demanded, eliminate the sextinreg.
+ EVT ExVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ if (DemandedBits.getActiveBits() <= ExVT.getScalarSizeInBits())
+ return Op.getOperand(0);
+ break;
+ }
+ case ISD::INSERT_VECTOR_ELT: {
+ // If we don't demand the inserted element, return the base vector.
+ SDValue Vec = Op.getOperand(0);
+ auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ EVT VecVT = Vec.getValueType();
+ if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
+ !DemandedElts[CIdx->getZExtValue()])
+ return Vec;
+ break;
+ }
+ case ISD::VECTOR_SHUFFLE: {
+ ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(Op)->getMask();
+
+ // If all the demanded elts are from one operand and are inline,
+ // then we can use the operand directly.
+ bool AllUndef = true, IdentityLHS = true, IdentityRHS = true;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int M = ShuffleMask[i];
+ if (M < 0 || !DemandedElts[i])
+ continue;
+ AllUndef = false;
+ IdentityLHS &= (M == (int)i);
+ IdentityRHS &= ((M - NumElts) == i);
+ }
+
+ if (AllUndef)
+ return DAG.getUNDEF(Op.getValueType());
+ if (IdentityLHS)
+ return Op.getOperand(0);
+ if (IdentityRHS)
+ return Op.getOperand(1);
+ break;
+ }
+ default:
+ if (Op.getOpcode() >= ISD::BUILTIN_OP_END)
+ if (SDValue V = SimplifyMultipleUseDemandedBitsForTargetNode(
+ Op, DemandedBits, DemandedElts, DAG, Depth))
+ return V;
+ break;
+ }
+ return SDValue();
+}
+
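The point of returning an existing SDValue instead of combining is that a multi-use node can be looked through without disturbing its other users. A standalone sanity check of one such bypass (plain C++, not LLVM code): if only the low byte is demanded, an OR against 0xFF00 contributes nothing, so its other operand may feed the user directly while the OR survives for everyone else.

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x = 0; x < 0x20000u; ++x) {
    uint32_t multiUse = x | 0xFF00u; // node kept alive by other users
    // Demanded bits 0x00FF are a subset of the OR operand's known zeros,
    // so the 'and' user can take x directly.
    assert((multiUse & 0xFFu) == (x & 0xFFu));
  }
  return 0;
}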
/// Look at Op. At this point, we know that only the OriginalDemandedBits of the
/// result of Op are ever used downstream. If we can use this information to
/// simplify Op, create a new simplified DAG node and return true, returning the
@@ -619,12 +798,15 @@ bool TargetLowering::SimplifyDemandedBits(
} else if (OriginalDemandedBits == 0 || OriginalDemandedElts == 0) {
// Not demanding any bits/elts from Op.
return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
- } else if (Depth == 6) { // Limit search depth.
+ } else if (Depth >= SelectionDAG::MaxRecursionDepth) {
+ // Limit search depth.
return false;
}
KnownBits Known2, KnownOut;
switch (Op.getOpcode()) {
+ case ISD::TargetConstant:
+ llvm_unreachable("Can't simplify this node");
case ISD::SCALAR_TO_VECTOR: {
if (!DemandedElts[0])
return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT));
@@ -728,6 +910,21 @@ bool TargetLowering::SimplifyDemandedBits(
}
break;
}
+ case ISD::EXTRACT_SUBVECTOR: {
+ // If index isn't constant, assume we need all the source vector elements.
+ SDValue Src = Op.getOperand(0);
+ ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+ APInt SrcElts = APInt::getAllOnesValue(NumSrcElts);
+ if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
+ // Offset the demanded elts by the subvector index.
+ uint64_t Idx = SubIdx->getZExtValue();
+ SrcElts = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
+ }
+ if (SimplifyDemandedBits(Src, DemandedBits, SrcElts, Known, TLO, Depth + 1))
+ return true;
+ break;
+ }
case ISD::CONCAT_VECTORS: {
Known.Zero.setAllBits();
Known.One.setAllBits();
@@ -773,22 +970,37 @@ bool TargetLowering::SimplifyDemandedBits(
}
if (!!DemandedLHS || !!DemandedRHS) {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
Known.Zero.setAllBits();
Known.One.setAllBits();
if (!!DemandedLHS) {
- if (SimplifyDemandedBits(Op.getOperand(0), DemandedBits, DemandedLHS,
- Known2, TLO, Depth + 1))
+ if (SimplifyDemandedBits(Op0, DemandedBits, DemandedLHS, Known2, TLO,
+ Depth + 1))
return true;
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
if (!!DemandedRHS) {
- if (SimplifyDemandedBits(Op.getOperand(1), DemandedBits, DemandedRHS,
- Known2, TLO, Depth + 1))
+ if (SimplifyDemandedBits(Op1, DemandedBits, DemandedRHS, Known2, TLO,
+ Depth + 1))
return true;
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
+ Op0, DemandedBits, DemandedLHS, TLO.DAG, Depth + 1);
+ SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
+ Op1, DemandedBits, DemandedRHS, TLO.DAG, Depth + 1);
+ if (DemandedOp0 || DemandedOp1) {
+ Op0 = DemandedOp0 ? DemandedOp0 : Op0;
+ Op1 = DemandedOp1 ? DemandedOp1 : Op1;
+ SDValue NewOp = TLO.DAG.getVectorShuffle(VT, dl, Op0, Op1, ShuffleMask);
+ return TLO.CombineTo(Op, NewOp);
+ }
}
break;
}
@@ -834,6 +1046,20 @@ bool TargetLowering::SimplifyDemandedBits(
return true;
assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (!DemandedBits.isAllOnesValue() || !DemandedElts.isAllOnesValue()) {
+ SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
+ Op0, DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
+ SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
+ Op1, DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
+ if (DemandedOp0 || DemandedOp1) {
+ Op0 = DemandedOp0 ? DemandedOp0 : Op0;
+ Op1 = DemandedOp1 ? DemandedOp1 : Op1;
+ SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1);
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
+
// If all of the demanded bits are known one on one side, return the other.
// These bits cannot contribute to the result of the 'and'.
if (DemandedBits.isSubsetOf(Known2.Zero | Known.One))
@@ -869,6 +1095,20 @@ bool TargetLowering::SimplifyDemandedBits(
return true;
assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (!DemandedBits.isAllOnesValue() || !DemandedElts.isAllOnesValue()) {
+ SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
+ Op0, DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
+ SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
+ Op1, DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
+ if (DemandedOp0 || DemandedOp1) {
+ Op0 = DemandedOp0 ? DemandedOp0 : Op0;
+ Op1 = DemandedOp1 ? DemandedOp1 : Op1;
+ SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1);
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
+
// If all of the demanded bits are known zero on one side, return the other.
// These bits cannot contribute to the result of the 'or'.
if (DemandedBits.isSubsetOf(Known2.One | Known.Zero))
@@ -901,6 +1141,20 @@ bool TargetLowering::SimplifyDemandedBits(
return true;
assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (!DemandedBits.isAllOnesValue() || !DemandedElts.isAllOnesValue()) {
+ SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
+ Op0, DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
+ SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
+ Op1, DemandedBits, DemandedElts, TLO.DAG, Depth + 1);
+ if (DemandedOp0 || DemandedOp1) {
+ Op0 = DemandedOp0 ? DemandedOp0 : Op0;
+ Op1 = DemandedOp1 ? DemandedOp1 : Op1;
+ SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1);
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
+
// If all of the demanded bits are known zero on one side, return the other.
// These bits cannot contribute to the result of the 'xor'.
if (DemandedBits.isSubsetOf(Known.Zero))
@@ -1034,7 +1288,7 @@ bool TargetLowering::SimplifyDemandedBits(
// out) are never demanded.
// TODO - support non-uniform vector amounts.
if (Op0.getOpcode() == ISD::SRL) {
- if ((DemandedBits & APInt::getLowBitsSet(BitWidth, ShAmt)) == 0) {
+ if (!DemandedBits.intersects(APInt::getLowBitsSet(BitWidth, ShAmt))) {
if (ConstantSDNode *SA2 =
isConstOrConstSplat(Op0.getOperand(1), DemandedElts)) {
if (SA2->getAPIntValue().ult(BitWidth)) {
@@ -1141,7 +1395,8 @@ bool TargetLowering::SimplifyDemandedBits(
if (Op0.getOpcode() == ISD::SHL) {
if (ConstantSDNode *SA2 =
isConstOrConstSplat(Op0.getOperand(1), DemandedElts)) {
- if ((DemandedBits & APInt::getHighBitsSet(BitWidth, ShAmt)) == 0) {
+ if (!DemandedBits.intersects(
+ APInt::getHighBitsSet(BitWidth, ShAmt))) {
if (SA2->getAPIntValue().ult(BitWidth)) {
unsigned C1 = SA2->getZExtValue();
unsigned Opc = ISD::SRL;
@@ -1479,6 +1734,11 @@ bool TargetLowering::SimplifyDemandedBits(
return true;
Known = Known.trunc(BitWidth);
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
+ Src, TruncMask, DemandedElts, TLO.DAG, Depth + 1))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::TRUNCATE, dl, VT, NewSrc));
+
// If the input is only used by this truncate, see if we can shrink it based
// on the known demanded bits.
if (Src.getNode()->hasOneUse()) {
@@ -1595,9 +1855,7 @@ bool TargetLowering::SimplifyDemandedBits(
// Bitcast from a vector using SimplifyDemanded Bits/VectorElts.
// Demand the elt/bit if any of the original elts/bits are demanded.
// TODO - bigendian once we have test coverage.
- // TODO - bool vectors once SimplifyDemandedVectorElts has SETCC support.
- if (SrcVT.isVector() && NumSrcEltBits > 1 &&
- (BitWidth % NumSrcEltBits) == 0 &&
+ if (SrcVT.isVector() && (BitWidth % NumSrcEltBits) == 0 &&
TLO.DAG.getDataLayout().isLittleEndian()) {
unsigned Scale = BitWidth / NumSrcEltBits;
unsigned NumSrcElts = SrcVT.getVectorNumElements();
@@ -1663,6 +1921,7 @@ bool TargetLowering::SimplifyDemandedBits(
// Add, Sub, and Mul don't demand any bits in positions beyond that
// of the highest bit demanded of them.
SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
+ SDNodeFlags Flags = Op.getNode()->getFlags();
unsigned DemandedBitsLZ = DemandedBits.countLeadingZeros();
APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ);
if (SimplifyDemandedBits(Op0, LoMask, DemandedElts, Known2, TLO,
@@ -1671,7 +1930,6 @@ bool TargetLowering::SimplifyDemandedBits(
Depth + 1) ||
// See if the operation should be performed at a smaller bit width.
ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO)) {
- SDNodeFlags Flags = Op.getNode()->getFlags();
if (Flags.hasNoSignedWrap() || Flags.hasNoUnsignedWrap()) {
// Disable the nsw and nuw flags. We can no longer guarantee that we
// won't wrap after simplification.
@@ -1684,6 +1942,23 @@ bool TargetLowering::SimplifyDemandedBits(
return true;
}
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (!LoMask.isAllOnesValue() || !DemandedElts.isAllOnesValue()) {
+ SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
+ Op0, LoMask, DemandedElts, TLO.DAG, Depth + 1);
+ SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
+ Op1, LoMask, DemandedElts, TLO.DAG, Depth + 1);
+ if (DemandedOp0 || DemandedOp1) {
+ Flags.setNoSignedWrap(false);
+ Flags.setNoUnsignedWrap(false);
+ Op0 = DemandedOp0 ? DemandedOp0 : Op0;
+ Op1 = DemandedOp1 ? DemandedOp1 : Op1;
+ SDValue NewOp =
+ TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Op1, Flags);
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
+
// If we have a constant operand, we may be able to turn it into -1 if we
// do not demand the high bits. This can make the constant smaller to
// encode, allow more general folding, or match specialized instruction
@@ -1694,10 +1969,8 @@ bool TargetLowering::SimplifyDemandedBits(
if (C && !C->isAllOnesValue() && !C->isOne() &&
(C->getAPIntValue() | HighMask).isAllOnesValue()) {
SDValue Neg1 = TLO.DAG.getAllOnesConstant(dl, VT);
- // We can't guarantee that the new math op doesn't wrap, so explicitly
- // clear those flags to prevent folding with a potential existing node
- // that has those flags set.
- SDNodeFlags Flags;
+ // Disable the nsw and nuw flags. We can no longer guarantee that we
+ // won't wrap after simplification.
Flags.setNoSignedWrap(false);
Flags.setNoUnsignedWrap(false);
SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, Op0, Neg1, Flags);
@@ -1837,7 +2110,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
}
// Limit search depth.
- if (Depth >= 6)
+ if (Depth >= SelectionDAG::MaxRecursionDepth)
return false;
SDLoc DL(Op);
@@ -2001,6 +2274,15 @@ bool TargetLowering::SimplifyDemandedVectorElts(
return true;
APInt BaseElts = DemandedElts;
BaseElts.insertBits(APInt::getNullValue(NumSubElts), SubIdx);
+
+ // If none of the base operand elements are demanded, replace it with undef.
+ if (!BaseElts && !Base.isUndef())
+ return TLO.CombineTo(Op,
+ TLO.DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ TLO.DAG.getUNDEF(VT),
+ Op.getOperand(1),
+ Op.getOperand(2)));
+
if (SimplifyDemandedVectorElts(Base, BaseElts, KnownUndef, KnownZero, TLO,
Depth + 1))
return true;
@@ -2134,11 +2416,13 @@ bool TargetLowering::SimplifyDemandedVectorElts(
// Update legal shuffle masks based on demanded elements if it won't reduce
// to Identity which can cause premature removal of the shuffle mask.
- if (Updated && !IdentityLHS && !IdentityRHS && !TLO.LegalOps &&
- isShuffleMaskLegal(NewMask, VT))
- return TLO.CombineTo(Op,
- TLO.DAG.getVectorShuffle(VT, DL, Op.getOperand(0),
- Op.getOperand(1), NewMask));
+ if (Updated && !IdentityLHS && !IdentityRHS && !TLO.LegalOps) {
+ SDValue LegalShuffle =
+ buildLegalVectorShuffle(VT, DL, Op.getOperand(0), Op.getOperand(1),
+ NewMask, TLO.DAG);
+ if (LegalShuffle)
+ return TLO.CombineTo(Op, LegalShuffle);
+ }
// Propagate undef/zero elements from LHS/RHS.
for (unsigned i = 0; i != NumElts; ++i) {
@@ -2304,6 +2588,13 @@ void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known.resetAll();
}
+void TargetLowering::computeKnownBitsForTargetInstr(
+ GISelKnownBits &Analysis, Register R, KnownBits &Known,
+ const APInt &DemandedElts, const MachineRegisterInfo &MRI,
+ unsigned Depth) const {
+ Known.resetAll();
+}
+
void TargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
KnownBits &Known,
const APInt &DemandedElts,
@@ -2357,6 +2648,36 @@ bool TargetLowering::SimplifyDemandedBitsForTargetNode(
return false;
}
+SDValue TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
+ SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ SelectionDAG &DAG, unsigned Depth) const {
+ assert(
+ (Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+ Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_VOID) &&
+ "Should use SimplifyMultipleUseDemandedBits if you don't know whether Op"
+ " is a target node!");
+ return SDValue();
+}
+
+SDValue
+TargetLowering::buildLegalVectorShuffle(EVT VT, const SDLoc &DL, SDValue N0,
+ SDValue N1, MutableArrayRef<int> Mask,
+ SelectionDAG &DAG) const {
+ bool LegalMask = isShuffleMaskLegal(Mask, VT);
+ if (!LegalMask) {
+ std::swap(N0, N1);
+ ShuffleVectorSDNode::commuteMask(Mask);
+ LegalMask = isShuffleMaskLegal(Mask, VT);
+ }
+
+ if (!LegalMask)
+ return SDValue();
+
+ return DAG.getVectorShuffle(VT, DL, N0, N1, Mask);
+}
+
const Constant *TargetLowering::getTargetConstantFromLoad(LoadSDNode*) const {
return nullptr;
}
@@ -2610,6 +2931,77 @@ SDValue TargetLowering::optimizeSetCCOfSignedTruncationCheck(
return T2;
}
+// (X & (C l>>/<< Y)) ==/!= 0 --> ((X <</l>> Y) & C) ==/!= 0
+SDValue TargetLowering::optimizeSetCCByHoistingAndByConstFromLogicalShift(
+ EVT SCCVT, SDValue N0, SDValue N1C, ISD::CondCode Cond,
+ DAGCombinerInfo &DCI, const SDLoc &DL) const {
+ assert(isConstOrConstSplat(N1C) &&
+ isConstOrConstSplat(N1C)->getAPIntValue().isNullValue() &&
+ "Should be a comparison with 0.");
+ assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+ "Valid only for [in]equality comparisons.");
+
+ unsigned NewShiftOpcode;
+ SDValue X, C, Y;
+
+ SelectionDAG &DAG = DCI.DAG;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // Look for '(C l>>/<< Y)'.
+ auto Match = [&NewShiftOpcode, &X, &C, &Y, &TLI, &DAG](SDValue V) {
+ // The shift should be one-use.
+ if (!V.hasOneUse())
+ return false;
+ unsigned OldShiftOpcode = V.getOpcode();
+ switch (OldShiftOpcode) {
+ case ISD::SHL:
+ NewShiftOpcode = ISD::SRL;
+ break;
+ case ISD::SRL:
+ NewShiftOpcode = ISD::SHL;
+ break;
+ default:
+ return false; // must be a logical shift.
+ }
+ // We should be shifting a constant.
+ // FIXME: best to use isConstantOrConstantVector().
+ C = V.getOperand(0);
+ ConstantSDNode *CC =
+ isConstOrConstSplat(C, /*AllowUndefs=*/true, /*AllowTruncation=*/true);
+ if (!CC)
+ return false;
+ Y = V.getOperand(1);
+
+ ConstantSDNode *XC =
+ isConstOrConstSplat(X, /*AllowUndefs=*/true, /*AllowTruncation=*/true);
+ return TLI.shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd(
+ X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG);
+ };
+
+ // LHS of comparison should be a one-use 'and'.
+ if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
+ return SDValue();
+
+ X = N0.getOperand(0);
+ SDValue Mask = N0.getOperand(1);
+
+ // 'and' is commutative!
+ if (!Match(Mask)) {
+ std::swap(X, Mask);
+ if (!Match(Mask))
+ return SDValue();
+ }
+
+ EVT VT = X.getValueType();
+
+ // Produce:
+ // ((X 'OppositeShiftOpcode' Y) & C) Cond 0
+ SDValue T0 = DAG.getNode(NewShiftOpcode, DL, VT, X, Y);
+ SDValue T1 = DAG.getNode(ISD::AND, DL, VT, T0, C);
+ SDValue T2 = DAG.getSetCC(DL, SCCVT, T1, N1C, Cond);
+ return T2;
+}
+
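A standalone check of the rewrite this helper performs (plain C++, not part of the patch): moving the logical shift off the constant and onto X preserves the ==/!= 0 answer, and leaves C unshifted so it stays cheap to materialize.

#include <cassert>
#include <cstdint>

int main() {
  // (x & (c << y)) == 0  <=>  ((x >> y) & c) == 0, and symmetrically for
  // the other shift direction; bits shifted out drop on both sides alike.
  for (uint32_t x = 0; x < 4096; x += 7)
    for (uint32_t c = 0; c < 64; ++c)
      for (uint32_t y = 0; y < 32; ++y)
        assert(((x & (c << y)) == 0) == (((x >> y) & c) == 0));
  return 0;
}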
/// Try to fold an equality comparison with a {add/sub/xor} binary operation as
/// the 1st operand (N0). Callers are expected to swap the N0/N1 parameters to
/// handle the commuted versions of these patterns.
@@ -2726,9 +3118,9 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
// (ctpop x) u< 2 -> (x & x-1) == 0
// (ctpop x) u> 1 -> (x & x-1) != 0
if ((Cond == ISD::SETULT && C1 == 2) || (Cond == ISD::SETUGT && C1 == 1)){
- SDValue Sub = DAG.getNode(ISD::SUB, dl, CTVT, CTOp,
- DAG.getConstant(1, dl, CTVT));
- SDValue And = DAG.getNode(ISD::AND, dl, CTVT, CTOp, Sub);
+ SDValue NegOne = DAG.getAllOnesConstant(dl, CTVT);
+ SDValue Add = DAG.getNode(ISD::ADD, dl, CTVT, CTOp, NegOne);
+ SDValue And = DAG.getNode(ISD::AND, dl, CTVT, CTOp, Add);
ISD::CondCode CC = Cond == ISD::SETULT ? ISD::SETEQ : ISD::SETNE;
return DAG.getSetCC(dl, VT, And, DAG.getConstant(0, dl, CTVT), CC);
}
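A standalone check of the identity this hunk relies on (plain C++; std::popcount needs C++20): clearing the lowest set bit yields zero exactly when at most one bit was set. The only change above is building the decrement as add x, -1, the DAG's canonical form of sub x, 1.

#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  // (ctpop x) u< 2  <=>  (x & (x - 1)) == 0, including x == 0.
  for (uint32_t x = 0; x < (1u << 20); ++x)
    assert((std::popcount(x) < 2) == ((x & (x - 1)) == 0));
  return 0;
}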
@@ -2852,7 +3244,7 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
LoadSDNode *Lod = cast<LoadSDNode>(N0.getOperand(0));
APInt bestMask;
unsigned bestWidth = 0, bestOffset = 0;
- if (!Lod->isVolatile() && Lod->isUnindexed()) {
+ if (Lod->isSimple() && Lod->isUnindexed()) {
unsigned origWidth = N0.getValueSizeInBits();
unsigned maskWidth = origWidth;
// We can narrow (e.g.) 16-bit extending loads on 32-bit target to
@@ -3178,6 +3570,14 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
}
}
+ if (Cond == ISD::SETEQ || Cond == ISD::SETNE) {
+ // (X & (C l>>/<< Y)) ==/!= 0 --> ((X <</l>> Y) & C) ==/!= 0
+ if (C1.isNullValue())
+ if (SDValue CC = optimizeSetCCByHoistingAndByConstFromLogicalShift(
+ VT, N0, N1, Cond, DCI, dl))
+ return CC;
+ }
+
// If we have "setcc X, C0", check to see if we can shrink the immediate
// by changing cc.
// TODO: Support this for vectors after legalize ops.
@@ -3203,33 +3603,35 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
// Back to non-vector simplifications.
// TODO: Can we do these for vector splats?
if (auto *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
const APInt &C1 = N1C->getAPIntValue();
+ EVT ShValTy = N0.getValueType();
// Fold bit comparisons when we can.
if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
- (VT == N0.getValueType() ||
- (isTypeLegal(VT) && VT.bitsLE(N0.getValueType()))) &&
+ (VT == ShValTy || (isTypeLegal(VT) && VT.bitsLE(ShValTy))) &&
N0.getOpcode() == ISD::AND) {
auto &DL = DAG.getDataLayout();
if (auto *AndRHS = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
- EVT ShiftTy = getShiftAmountTy(N0.getValueType(), DL,
- !DCI.isBeforeLegalize());
+ EVT ShiftTy = getShiftAmountTy(ShValTy, DL, !DCI.isBeforeLegalize());
if (Cond == ISD::SETNE && C1 == 0) {// (X & 8) != 0 --> (X & 8) >> 3
// Perform the xform if the AND RHS is a single bit.
- if (AndRHS->getAPIntValue().isPowerOf2()) {
+ unsigned ShCt = AndRHS->getAPIntValue().logBase2();
+ if (AndRHS->getAPIntValue().isPowerOf2() &&
+ ShCt <= TLI.getShiftAmountThreshold(ShValTy)) {
return DAG.getNode(ISD::TRUNCATE, dl, VT,
- DAG.getNode(ISD::SRL, dl, N0.getValueType(), N0,
- DAG.getConstant(AndRHS->getAPIntValue().logBase2(), dl,
- ShiftTy)));
+ DAG.getNode(ISD::SRL, dl, ShValTy, N0,
+ DAG.getConstant(ShCt, dl, ShiftTy)));
}
} else if (Cond == ISD::SETEQ && C1 == AndRHS->getAPIntValue()) {
// (X & 8) == 8 --> (X & 8) >> 3
// Perform the xform if C1 is a single bit.
- if (C1.isPowerOf2()) {
+ unsigned ShCt = C1.logBase2();
+ if (C1.isPowerOf2() &&
+ ShCt <= TLI.getShiftAmountThreshold(ShValTy)) {
return DAG.getNode(ISD::TRUNCATE, dl, VT,
- DAG.getNode(ISD::SRL, dl, N0.getValueType(), N0,
- DAG.getConstant(C1.logBase2(), dl,
- ShiftTy)));
+ DAG.getNode(ISD::SRL, dl, ShValTy, N0,
+ DAG.getConstant(ShCt, dl, ShiftTy)));
}
}
}
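A standalone check of the single-bit transform being guarded here (plain C++, not part of the patch): with a power-of-two mask, the ==/!= answer is just the masked bit moved down to position 0. The new getShiftAmountThreshold query lets a target decline the fold when the resulting shift amount is too expensive.

#include <cassert>
#include <cstdint>

int main() {
  // (X & 8) != 0  -->  (X & 8) >> 3
  for (uint32_t x = 0; x < 4096; ++x)
    assert(((x & 8u) != 0 ? 1u : 0u) == ((x & 8u) >> 3));
  return 0;
}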
@@ -3452,15 +3854,21 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
}
// Fold remainder of division by a constant.
- if (N0.getOpcode() == ISD::UREM && N0.hasOneUse() &&
- (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
+ if ((N0.getOpcode() == ISD::UREM || N0.getOpcode() == ISD::SREM) &&
+ N0.hasOneUse() && (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes();
// When division is cheap or optimizing for minimum size,
// fall through to DIVREM creation by skipping this fold.
- if (!isIntDivCheap(VT, Attr) && !Attr.hasFnAttribute(Attribute::MinSize))
- if (SDValue Folded = buildUREMEqFold(VT, N0, N1, Cond, DCI, dl))
- return Folded;
+ if (!isIntDivCheap(VT, Attr) && !Attr.hasFnAttribute(Attribute::MinSize)) {
+ if (N0.getOpcode() == ISD::UREM) {
+ if (SDValue Folded = buildUREMEqFold(VT, N0, N1, Cond, DCI, dl))
+ return Folded;
+ } else if (N0.getOpcode() == ISD::SREM) {
+ if (SDValue Folded = buildSREMEqFold(VT, N0, N1, Cond, DCI, dl))
+ return Folded;
+ }
+ }
}
// Fold away ALL boolean setcc's.
@@ -3567,15 +3975,17 @@ TargetLowering::getConstraintType(StringRef Constraint) const {
if (S == 1) {
switch (Constraint[0]) {
default: break;
- case 'r': return C_RegisterClass;
+ case 'r':
+ return C_RegisterClass;
case 'm': // memory
case 'o': // offsetable
case 'V': // not offsetable
return C_Memory;
- case 'i': // Simple Integer or Relocatable Constant
case 'n': // Simple Integer
case 'E': // Floating Point Constant
case 'F': // Floating Point Constant
+ return C_Immediate;
+ case 'i': // Simple Integer or Relocatable Constant
case 's': // Relocatable Constant
case 'p': // Address.
case 'X': // Allow ANY value.
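A small GCC-style illustration of the reclassification (hypothetical user code, not part of the patch): 'n' now maps to C_Immediate, so an operand that is not a compile-time constant can hit the 'value out of range' / 'invalid operand' diagnostics added earlier in this patch instead of being handled as a generic C_Other operand.

#include <cstdint>

uint32_t demo(uint32_t v) {
  asm volatile("" ::"n"(42)); // OK: 'n' requires an integer immediate
  asm volatile("" ::"r"(v));  // OK: 'r' takes any value in a register
  // asm volatile("" ::"n"(v)); // not a constant: now cleanly diagnosable
  return v;
}

int main() { return demo(7) - 7; }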
@@ -3950,6 +4360,7 @@ TargetLowering::ParseConstraints(const DataLayout &DL,
/// Return an integer indicating how general CT is.
static unsigned getConstraintGenerality(TargetLowering::ConstraintType CT) {
switch (CT) {
+ case TargetLowering::C_Immediate:
case TargetLowering::C_Other:
case TargetLowering::C_Unknown:
return 0;
@@ -4069,11 +4480,12 @@ static void ChooseConstraint(TargetLowering::AsmOperandInfo &OpInfo,
TargetLowering::ConstraintType CType =
TLI.getConstraintType(OpInfo.Codes[i]);
- // If this is an 'other' constraint, see if the operand is valid for it.
- // For example, on X86 we might have an 'rI' constraint. If the operand
- // is an integer in the range [0..31] we want to use I (saving a load
- // of a register), otherwise we must use 'r'.
- if (CType == TargetLowering::C_Other && Op.getNode()) {
+ // If this is an 'other' or 'immediate' constraint, see if the operand is
+ // valid for it. For example, on X86 we might have an 'rI' constraint. If
+ // the operand is an integer in the range [0..31] we want to use I (saving a
+ // load of a register), otherwise we must use 'r'.
+ if ((CType == TargetLowering::C_Other ||
+ CType == TargetLowering::C_Immediate) && Op.getNode()) {
assert(OpInfo.Codes[i].size() == 1 &&
"Unhandled multi-letter 'other' constraint");
std::vector<SDValue> ResultOps;
@@ -4455,6 +4867,34 @@ SDValue TargetLowering::BuildUDIV(SDNode *N, SelectionDAG &DAG,
return DAG.getSelect(dl, VT, IsOne, N0, Q);
}
+/// If all values in Values that *don't* match the predicate are the same
+/// 'splat' value, then replace all values that do match the predicate with
+/// that splat value.
+/// Else, if AlternativeReplacement was provided, replace all values that do
+/// match the predicate with the AlternativeReplacement value.
+static void
+turnVectorIntoSplatVector(MutableArrayRef<SDValue> Values,
+ std::function<bool(SDValue)> Predicate,
+ SDValue AlternativeReplacement = SDValue()) {
+ SDValue Replacement;
+ // Is there a value for which the Predicate does *NOT* match? What is it?
+ auto SplatValue = llvm::find_if_not(Values, Predicate);
+ if (SplatValue != Values.end()) {
+ // Does Values consist only of copies of SplatValue and predicate matches?
+ if (llvm::all_of(Values, [Predicate, SplatValue](SDValue Value) {
+ return Value == *SplatValue || Predicate(Value);
+ })) // Then we shall replace values matching predicate with SplatValue.
+ Replacement = *SplatValue;
+ }
+ if (!Replacement) {
+ // Oops, we did not find the "baseline" splat value.
+ if (!AlternativeReplacement)
+ return; // Nothing to do.
+ // Let's replace with provided value then.
+ Replacement = AlternativeReplacement;
+ }
+ std::replace_if(Values.begin(), Values.end(), Predicate, Replacement);
+}
+
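The helper's shape, restated on plain ints (a minimal standalone analog, not LLVM code): find a non-matching 'baseline' element, confirm every element either matches the predicate or equals that baseline, then overwrite the matching elements with it.

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<int> V = {5, 0, 5, 0, 5};
  auto IsZero = [](int X) { return X == 0; };
  // Find the candidate splat value: the first non-matching element.
  auto Splat = std::find_if_not(V.begin(), V.end(), IsZero);
  if (Splat != V.end() &&
      std::all_of(V.begin(), V.end(),
                  [&](int X) { return X == *Splat || IsZero(X); }))
    std::replace_if(V.begin(), V.end(), IsZero, *Splat);
  assert(V == std::vector<int>({5, 5, 5, 5, 5}));
  return 0;
}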
/// Given an ISD::UREM used only by an ISD::SETEQ or ISD::SETNE
/// where the divisor is constant and the comparison target is zero,
/// return a DAG expression that will generate the same comparison result
@@ -4482,77 +4922,409 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode,
DAGCombinerInfo &DCI, const SDLoc &DL,
SmallVectorImpl<SDNode *> &Created) const {
// fold (seteq/ne (urem N, D), 0) -> (setule/ugt (rotr (mul N, P), K), Q)
- // - D must be constant with D = D0 * 2^K where D0 is odd and D0 != 1
+ // - D must be constant, with D = D0 * 2^K where D0 is odd
// - P is the multiplicative inverse of D0 modulo 2^W
- // - Q = floor((2^W - 1) / D0)
+ // - Q = floor(((2^W) - 1) / D)
// where W is the width of the common type of N and D.
assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
"Only applicable for (in)equality comparisons.");
+ SelectionDAG &DAG = DCI.DAG;
+
EVT VT = REMNode.getValueType();
+ EVT SVT = VT.getScalarType();
+ EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
+ EVT ShSVT = ShVT.getScalarType();
// If MUL is unavailable, we cannot proceed in any case.
if (!isOperationLegalOrCustom(ISD::MUL, VT))
return SDValue();
- // TODO: Add non-uniform constant support.
- ConstantSDNode *Divisor = isConstOrConstSplat(REMNode->getOperand(1));
+ // TODO: Could support comparing with non-zero too.
ConstantSDNode *CompTarget = isConstOrConstSplat(CompTargetNode);
- if (!Divisor || !CompTarget || Divisor->isNullValue() ||
- !CompTarget->isNullValue())
+ if (!CompTarget || !CompTarget->isNullValue())
return SDValue();
- const APInt &D = Divisor->getAPIntValue();
+ bool HadOneDivisor = false;
+ bool AllDivisorsAreOnes = true;
+ bool HadEvenDivisor = false;
+ bool AllDivisorsArePowerOfTwo = true;
+ SmallVector<SDValue, 16> PAmts, KAmts, QAmts;
+
+ auto BuildUREMPattern = [&](ConstantSDNode *C) {
+ // Division by 0 is UB. Leave it to be constant-folded elsewhere.
+ if (C->isNullValue())
+ return false;
+
+ const APInt &D = C->getAPIntValue();
+ // If all divisors are ones, we will prefer to avoid the fold.
+ HadOneDivisor |= D.isOneValue();
+ AllDivisorsAreOnes &= D.isOneValue();
+
+ // Decompose D into D0 * 2^K
+ unsigned K = D.countTrailingZeros();
+ assert((!D.isOneValue() || (K == 0)) && "For divisor '1' we won't rotate.");
+ APInt D0 = D.lshr(K);
+
+ // D is even if it has trailing zeros.
+ HadEvenDivisor |= (K != 0);
+ // D is a power-of-two if D0 is one.
+ // If all divisors are power-of-two, we will prefer to avoid the fold.
+ AllDivisorsArePowerOfTwo &= D0.isOneValue();
+
+ // P = inv(D0, 2^W)
+ // 2^W requires W + 1 bits, so we have to extend and then truncate.
+ unsigned W = D.getBitWidth();
+ APInt P = D0.zext(W + 1)
+ .multiplicativeInverse(APInt::getSignedMinValue(W + 1))
+ .trunc(W);
+ assert(!P.isNullValue() && "No multiplicative inverse!"); // unreachable
+ assert((D0 * P).isOneValue() && "Multiplicative inverse sanity check.");
+
+ // Q = floor((2^W - 1) / D)
+ APInt Q = APInt::getAllOnesValue(W).udiv(D);
+
+ assert(APInt::getAllOnesValue(ShSVT.getSizeInBits()).ugt(K) &&
+ "We are expecting that K is always less than all-ones for ShSVT");
+
+ // If the divisor is 1, the result can be constant-folded.
+ if (D.isOneValue()) {
+ // Set the P and K amounts to bogus values so we can try to splat them.
+ P = 0;
+ K = -1;
+ assert(Q.isAllOnesValue() &&
+ "Expecting all-ones comparison for one divisor");
+ }
+
+ PAmts.push_back(DAG.getConstant(P, DL, SVT));
+ KAmts.push_back(
+ DAG.getConstant(APInt(ShSVT.getSizeInBits(), K), DL, ShSVT));
+ QAmts.push_back(DAG.getConstant(Q, DL, SVT));
+ return true;
+ };
+
+ SDValue N = REMNode.getOperand(0);
+ SDValue D = REMNode.getOperand(1);
- // Decompose D into D0 * 2^K
- unsigned K = D.countTrailingZeros();
- bool DivisorIsEven = (K != 0);
- APInt D0 = D.lshr(K);
+ // Collect the values from each element.
+ if (!ISD::matchUnaryPredicate(D, BuildUREMPattern))
+ return SDValue();
- // The fold is invalid when D0 == 1.
- // This is reachable because visitSetCC happens before visitREM.
- if (D0.isOneValue())
+ // If this is a urem by one, avoid the fold since it can be constant-folded.
+ if (AllDivisorsAreOnes)
return SDValue();
- // P = inv(D0, 2^W)
- // 2^W requires W + 1 bits, so we have to extend and then truncate.
- unsigned W = D.getBitWidth();
- APInt P = D0.zext(W + 1)
- .multiplicativeInverse(APInt::getSignedMinValue(W + 1))
- .trunc(W);
- assert(!P.isNullValue() && "No multiplicative inverse!"); // unreachable
- assert((D0 * P).isOneValue() && "Multiplicative inverse sanity check.");
+ // If this is a urem by a power-of-two, avoid the fold since it is better
+ // implemented as a bit test.
+ if (AllDivisorsArePowerOfTwo)
+ return SDValue();
- // Q = floor((2^W - 1) / D)
- APInt Q = APInt::getAllOnesValue(W).udiv(D);
+ SDValue PVal, KVal, QVal;
+ if (VT.isVector()) {
+ if (HadOneDivisor) {
+ // Try to turn PAmts into a splat, since we don't care about the values
+ // that are currently '0'. If we can't, just keep the '0's.
+ turnVectorIntoSplatVector(PAmts, isNullConstant);
+ // Try to turn KAmts into a splat, since we don't care about the values
+ // that are currently '-1'. If we can't, change them to '0's.
+ turnVectorIntoSplatVector(KAmts, isAllOnesConstant,
+ DAG.getConstant(0, DL, ShSVT));
+ }
- SelectionDAG &DAG = DCI.DAG;
+ PVal = DAG.getBuildVector(VT, DL, PAmts);
+ KVal = DAG.getBuildVector(ShVT, DL, KAmts);
+ QVal = DAG.getBuildVector(VT, DL, QAmts);
+ } else {
+ PVal = PAmts[0];
+ KVal = KAmts[0];
+ QVal = QAmts[0];
+ }
- SDValue PVal = DAG.getConstant(P, DL, VT);
- SDValue QVal = DAG.getConstant(Q, DL, VT);
// (mul N, P)
- SDValue Op1 = DAG.getNode(ISD::MUL, DL, VT, REMNode->getOperand(0), PVal);
- Created.push_back(Op1.getNode());
+ SDValue Op0 = DAG.getNode(ISD::MUL, DL, VT, N, PVal);
+ Created.push_back(Op0.getNode());
- // Rotate right only if D was even.
- if (DivisorIsEven) {
+ // Rotate right only if any divisor was even. We avoid rotates for all-odd
+ // divisors as a performance improvement, since rotating by 0 is a no-op.
+ if (HadEvenDivisor) {
// We need ROTR to do this.
if (!isOperationLegalOrCustom(ISD::ROTR, VT))
return SDValue();
- SDValue ShAmt =
- DAG.getConstant(K, DL, getShiftAmountTy(VT, DAG.getDataLayout()));
SDNodeFlags Flags;
Flags.setExact(true);
// UREM: (rotr (mul N, P), K)
- Op1 = DAG.getNode(ISD::ROTR, DL, VT, Op1, ShAmt, Flags);
- Created.push_back(Op1.getNode());
+ Op0 = DAG.getNode(ISD::ROTR, DL, VT, Op0, KVal, Flags);
+ Created.push_back(Op0.getNode());
}
// UREM: (setule/setugt (rotr (mul N, P), K), Q)
- return DAG.getSetCC(DL, SETCCVT, Op1, QVal,
+ return DAG.getSetCC(DL, SETCCVT, Op0, QVal,
((Cond == ISD::SETEQ) ? ISD::SETULE : ISD::SETUGT));
}
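A worked scalar instance of the pattern (plain C++, not part of the patch): take D = 6 = 3 * 2^1, so D0 = 3 and K = 1 at W = 32, giving P = inv(3, 2^32) = 0xAAAAAAAB and Q = floor((2^32 - 1) / 6) = 0x2AAAAAAA.

#include <cassert>
#include <cstdint>

static uint32_t rotr32(uint32_t v, unsigned s) {
  return s ? (v >> s) | (v << (32 - s)) : v;
}

int main() {
  const uint32_t P = 0xAAAAAAABu, Q = 0x2AAAAAAAu;
  for (uint32_t n = 0; n < (1u << 20); ++n)
    assert((n % 6 == 0) == (rotr32(n * P, 1) <= Q)); // urem -> mul+rotr+cmp
  return 0;
}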
+/// Given an ISD::SREM used only by an ISD::SETEQ or ISD::SETNE
+/// where the divisor is constant and the comparison target is zero,
+/// return a DAG expression that will generate the same comparison result
+/// using only multiplications, additions and shifts/rotations.
+/// Ref: "Hacker's Delight" 10-17.
+SDValue TargetLowering::buildSREMEqFold(EVT SETCCVT, SDValue REMNode,
+ SDValue CompTargetNode,
+ ISD::CondCode Cond,
+ DAGCombinerInfo &DCI,
+ const SDLoc &DL) const {
+ SmallVector<SDNode *, 7> Built;
+ if (SDValue Folded = prepareSREMEqFold(SETCCVT, REMNode, CompTargetNode, Cond,
+ DCI, DL, Built)) {
+ assert(Built.size() <= 7 && "Max size prediction failed.");
+ for (SDNode *N : Built)
+ DCI.AddToWorklist(N);
+ return Folded;
+ }
+
+ return SDValue();
+}
+
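A worked scalar instance of the srem fold prepared below (plain C++, not part of the patch): take D = 3 at W = 32, so D0 = 3 and K = 0, giving P = inv(3, 2^32) = 0xAAAAAAAB, A = floor((2^31 - 1) / 3) = 0x2AAAAAAA, and Q = 2 * A = 0x55555554; with K = 0 the rotate drops out entirely.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t P = 0xAAAAAAABu, A = 0x2AAAAAAAu, Q = 0x55555554u;
  for (int32_t n = -(1 << 20); n < (1 << 20); ++n)
    assert((n % 3 == 0) == (uint32_t(n) * P + A <= Q)); // srem -> mul+add+cmp
  return 0;
}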
+SDValue
+TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode,
+ SDValue CompTargetNode, ISD::CondCode Cond,
+ DAGCombinerInfo &DCI, const SDLoc &DL,
+ SmallVectorImpl<SDNode *> &Created) const {
+ // Fold:
+ // (seteq/ne (srem N, D), 0)
+ // To:
+ // (setule/ugt (rotr (add (mul N, P), A), K), Q)
+ //
+ // - D must be constant, with D = D0 * 2^K where D0 is odd
+ // - P is the multiplicative inverse of D0 modulo 2^W
+ // - A = bitwiseand(floor((2^(W - 1) - 1) / D0), (-(2^K)))
+ // - Q = floor((2 * A) / (2^K))
+ // where W is the width of the common type of N and D.
+ assert((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+ "Only applicable for (in)equality comparisons.");
+
+ SelectionDAG &DAG = DCI.DAG;
+
+ EVT VT = REMNode.getValueType();
+ EVT SVT = VT.getScalarType();
+ EVT ShVT = getShiftAmountTy(VT, DAG.getDataLayout());
+ EVT ShSVT = ShVT.getScalarType();
+
+ // If MUL is unavailable, we cannot proceed in any case.
+ if (!isOperationLegalOrCustom(ISD::MUL, VT))
+ return SDValue();
+
+ // TODO: Could support comparing with non-zero too.
+ ConstantSDNode *CompTarget = isConstOrConstSplat(CompTargetNode);
+ if (!CompTarget || !CompTarget->isNullValue())
+ return SDValue();
+
+ bool HadIntMinDivisor = false;
+ bool HadOneDivisor = false;
+ bool AllDivisorsAreOnes = true;
+ bool HadEvenDivisor = false;
+ bool NeedToApplyOffset = false;
+ bool AllDivisorsArePowerOfTwo = true;
+ SmallVector<SDValue, 16> PAmts, AAmts, KAmts, QAmts;
+
+ auto BuildSREMPattern = [&](ConstantSDNode *C) {
+ // Division by 0 is UB. Leave it to be constant-folded elsewhere.
+ if (C->isNullValue())
+ return false;
+
+ // FIXME: we don't fold `rem %X, -C` to `rem %X, C` in DAGCombine.
+
+ // WARNING: this fold is only valid for positive divisors!
+ APInt D = C->getAPIntValue();
+ if (D.isNegative())
+ D.negate(); // `rem %X, -C` is equivalent to `rem %X, C`
+
+ HadIntMinDivisor |= D.isMinSignedValue();
+
+ // If all divisors are ones, we will prefer to avoid the fold.
+ HadOneDivisor |= D.isOneValue();
+ AllDivisorsAreOnes &= D.isOneValue();
+
+ // Decompose D into D0 * 2^K
+ unsigned K = D.countTrailingZeros();
+ assert((!D.isOneValue() || (K == 0)) && "For divisor '1' we won't rotate.");
+ APInt D0 = D.lshr(K);
+
+ if (!D.isMinSignedValue()) {
+ // D is even if it has trailing zeros, unless it's INT_MIN, in which case
+ // we don't care about this lane in this fold; we'll special-handle it.
+ HadEvenDivisor |= (K != 0);
+ }
+
+ // D is a power-of-two if D0 is one. This includes INT_MIN.
+ // If all divisors are powers-of-two, we will prefer to avoid the fold.
+ AllDivisorsArePowerOfTwo &= D0.isOneValue();
+
+ // P = inv(D0, 2^W)
+ // 2^W requires W + 1 bits, so we have to extend and then truncate.
+ unsigned W = D.getBitWidth();
+ APInt P = D0.zext(W + 1)
+ .multiplicativeInverse(APInt::getSignedMinValue(W + 1))
+ .trunc(W);
+ assert(!P.isNullValue() && "No multiplicative inverse!"); // unreachable
+ assert((D0 * P).isOneValue() && "Multiplicative inverse sanity check.");
+
+ // A = floor((2^(W - 1) - 1) / D0) & -2^K
+ APInt A = APInt::getSignedMaxValue(W).udiv(D0);
+ A.clearLowBits(K);
+
+ if (!D.isMinSignedValue()) {
+ // If the divisor is INT_MIN, we don't care about this lane in this
+ // fold; we'll special-handle it.
+ NeedToApplyOffset |= A != 0;
+ }
+
+ // Q = floor((2 * A) / (2^K))
+ APInt Q = (2 * A).udiv(APInt::getOneBitSet(W, K));
+
+ assert(APInt::getAllOnesValue(SVT.getSizeInBits()).ugt(A) &&
+ "We are expecting that A is always less than all-ones for SVT");
+ assert(APInt::getAllOnesValue(ShSVT.getSizeInBits()).ugt(K) &&
+ "We are expecting that K is always less than all-ones for ShSVT");
+
+ // If the divisor is 1, the result can be constant-folded. Likewise, we
+ // don't care about INT_MIN lanes; those can be set to undef if appropriate.
+ if (D.isOneValue()) {
+ // Set P, A and K to bogus values so we can try to splat them.
+ P = 0;
+ A = -1;
+ K = -1;
+
+ // x ?% 1 == 0 <--> true <--> x u<= -1
+ Q = -1;
+ }
+
+ PAmts.push_back(DAG.getConstant(P, DL, SVT));
+ AAmts.push_back(DAG.getConstant(A, DL, SVT));
+ KAmts.push_back(
+ DAG.getConstant(APInt(ShSVT.getSizeInBits(), K), DL, ShSVT));
+ QAmts.push_back(DAG.getConstant(Q, DL, SVT));
+ return true;
+ };
+
+ SDValue N = REMNode.getOperand(0);
+ SDValue D = REMNode.getOperand(1);
+
+ // Collect the values from each element.
+ if (!ISD::matchUnaryPredicate(D, BuildSREMPattern))
+ return SDValue();
+
+ // If this is an srem by one, avoid the fold since it can be constant-folded.
+ if (AllDivisorsAreOnes)
+ return SDValue();
+
+ // If this is an srem by a power-of-two (including INT_MIN), avoid the fold
+ // since it is best implemented as a bit test.
+ if (AllDivisorsArePowerOfTwo)
+ return SDValue();
+
+ SDValue PVal, AVal, KVal, QVal;
+ if (VT.isVector()) {
+ if (HadOneDivisor) {
+ // Try to turn PAmts into a splat, since we don't care about the values
+ // that are currently '0'. If we can't, just keep the '0's.
+ turnVectorIntoSplatVector(PAmts, isNullConstant);
+ // Try to turn AAmts into a splat, since we don't care about the
+ // values that are currently '-1'. If we can't, change them to '0's.
+ turnVectorIntoSplatVector(AAmts, isAllOnesConstant,
+ DAG.getConstant(0, DL, SVT));
+ // Try to turn KAmts into a splat, since we don't care about the values
+ // that are currently '-1'. If we can't, change them to '0's.
+ turnVectorIntoSplatVector(KAmts, isAllOnesConstant,
+ DAG.getConstant(0, DL, ShSVT));
+ }
+
+ PVal = DAG.getBuildVector(VT, DL, PAmts);
+ AVal = DAG.getBuildVector(VT, DL, AAmts);
+ KVal = DAG.getBuildVector(ShVT, DL, KAmts);
+ QVal = DAG.getBuildVector(VT, DL, QAmts);
+ } else {
+ PVal = PAmts[0];
+ AVal = AAmts[0];
+ KVal = KAmts[0];
+ QVal = QAmts[0];
+ }
+
+ // (mul N, P)
+ SDValue Op0 = DAG.getNode(ISD::MUL, DL, VT, N, PVal);
+ Created.push_back(Op0.getNode());
+
+ if (NeedToApplyOffset) {
+ // We need ADD to do this.
+ if (!isOperationLegalOrCustom(ISD::ADD, VT))
+ return SDValue();
+
+ // (add (mul N, P), A)
+ Op0 = DAG.getNode(ISD::ADD, DL, VT, Op0, AVal);
+ Created.push_back(Op0.getNode());
+ }
+
+ // Rotate right only if any divisor was even. We avoid rotates for all-odd
+ // divisors as a performance improvement, since rotating by 0 is a no-op.
+ if (HadEvenDivisor) {
+ // We need ROTR to do this.
+ if (!isOperationLegalOrCustom(ISD::ROTR, VT))
+ return SDValue();
+ SDNodeFlags Flags;
+ Flags.setExact(true);
+ // SREM: (rotr (add (mul N, P), A), K)
+ Op0 = DAG.getNode(ISD::ROTR, DL, VT, Op0, KVal, Flags);
+ Created.push_back(Op0.getNode());
+ }
+
+ // SREM: (setule/setugt (rotr (add (mul N, P), A), K), Q)
+ SDValue Fold =
+ DAG.getSetCC(DL, SETCCVT, Op0, QVal,
+ ((Cond == ISD::SETEQ) ? ISD::SETULE : ISD::SETUGT));
+
+ // If we didn't have lanes with INT_MIN divisor, then we're done.
+ if (!HadIntMinDivisor)
+ return Fold;
+
+ // The fold above is only valid for positive divisors, which effectively
+ // means it is invalid for INT_MIN divisors. So if we have such a lane,
+ // we must fix up the results for those lanes.
+ assert(VT.isVector() && "Can/should only get here for vectors.");
+
+ if (!isOperationLegalOrCustom(ISD::SETEQ, VT) ||
+ !isOperationLegalOrCustom(ISD::AND, VT) ||
+ !isOperationLegalOrCustom(Cond, VT) ||
+ !isOperationLegalOrCustom(ISD::VSELECT, VT))
+ return SDValue();
+
+ Created.push_back(Fold.getNode());
+
+ SDValue IntMin = DAG.getConstant(
+ APInt::getSignedMinValue(SVT.getScalarSizeInBits()), DL, VT);
+ SDValue IntMax = DAG.getConstant(
+ APInt::getSignedMaxValue(SVT.getScalarSizeInBits()), DL, VT);
+ SDValue Zero =
+ DAG.getConstant(APInt::getNullValue(SVT.getScalarSizeInBits()), DL, VT);
+
+ // Which lanes had INT_MIN divisors? Divisor is constant, so const-folded.
+ SDValue DivisorIsIntMin = DAG.getSetCC(DL, SETCCVT, D, IntMin, ISD::SETEQ);
+ Created.push_back(DivisorIsIntMin.getNode());
+
+ // (N s% INT_MIN) ==/!= 0 <--> (N & INT_MAX) ==/!= 0
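+ // (the only W-bit multiples of INT_MIN are 0 and INT_MIN itself, and both
+ // have all of their low W - 1 bits clear)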
+ SDValue Masked = DAG.getNode(ISD::AND, DL, VT, N, IntMax);
+ Created.push_back(Masked.getNode());
+ SDValue MaskedIsZero = DAG.getSetCC(DL, SETCCVT, Masked, Zero, Cond);
+ Created.push_back(MaskedIsZero.getNode());
+
+ // To produce the final result we need to blend two vectors: 'Fold' and
+ // 'MaskedIsZero'. If the divisor for a lane was *NOT* INT_MIN, we pick
+ // from 'Fold'; else we pick from 'MaskedIsZero'. Since 'DivisorIsIntMin' is
+ // constant-folded, the select can be lowered to a shuffle with constant mask.
+ SDValue Blended =
+ DAG.getNode(ISD::VSELECT, DL, VT, DivisorIsIntMin, MaskedIsZero, Fold);
+
+ return Blended;
+}
+
bool TargetLowering::
verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const {
if (!isa<ConstantSDNode>(Op.getOperand(0))) {
@@ -4564,6 +5336,246 @@ verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const {
return false;
}
+char TargetLowering::isNegatibleForFree(SDValue Op, SelectionDAG &DAG,
+ bool LegalOperations, bool ForCodeSize,
+ unsigned Depth) const {
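+ // Return 0 if the negated form of Op cannot be computed for free, 1 if it
+ // costs the same as Op itself, and 2 if the negated form is strictly
+ // cheaper (e.g. the negation just removes an existing FNEG).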
+ // fneg is removable even if it has multiple uses.
+ if (Op.getOpcode() == ISD::FNEG)
+ return 2;
+
+ // Don't allow anything with multiple uses unless we know it is free.
+ EVT VT = Op.getValueType();
+ const SDNodeFlags Flags = Op->getFlags();
+ const TargetOptions &Options = DAG.getTarget().Options;
+ if (!Op.hasOneUse() && !(Op.getOpcode() == ISD::FP_EXTEND &&
+ isFPExtFree(VT, Op.getOperand(0).getValueType())))
+ return 0;
+
+ // Don't recurse exponentially.
+ if (Depth > SelectionDAG::MaxRecursionDepth)
+ return 0;
+
+ switch (Op.getOpcode()) {
+ case ISD::ConstantFP: {
+ if (!LegalOperations)
+ return 1;
+
+ // Don't invert constant FP values after legalization unless the target says
+ // the negated constant is legal.
+ return isOperationLegal(ISD::ConstantFP, VT) ||
+ isFPImmLegal(neg(cast<ConstantFPSDNode>(Op)->getValueAPF()), VT,
+ ForCodeSize);
+ }
+ case ISD::BUILD_VECTOR: {
+ // Only permit BUILD_VECTOR of constants.
+ if (llvm::any_of(Op->op_values(), [&](SDValue N) {
+ return !N.isUndef() && !isa<ConstantFPSDNode>(N);
+ }))
+ return 0;
+ if (!LegalOperations)
+ return 1;
+ if (isOperationLegal(ISD::ConstantFP, VT) &&
+ isOperationLegal(ISD::BUILD_VECTOR, VT))
+ return 1;
+ return llvm::all_of(Op->op_values(), [&](SDValue N) {
+ return N.isUndef() ||
+ isFPImmLegal(neg(cast<ConstantFPSDNode>(N)->getValueAPF()), VT,
+ ForCodeSize);
+ });
+ }
+ case ISD::FADD:
+ if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros())
+ return 0;
+
+ // After operation legalization, it might not be legal to create new FSUBs.
+ if (LegalOperations && !isOperationLegalOrCustom(ISD::FSUB, VT))
+ return 0;
+
+ // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
+ if (char V = isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations,
+ ForCodeSize, Depth + 1))
+ return V;
+ // fold (fneg (fadd A, B)) -> (fsub (fneg B), A)
+ return isNegatibleForFree(Op.getOperand(1), DAG, LegalOperations,
+ ForCodeSize, Depth + 1);
+ case ISD::FSUB:
+ // We can't turn -(A-B) into B-A when we honor signed zeros.
+ if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros())
+ return 0;
+
+ // fold (fneg (fsub A, B)) -> (fsub B, A)
+ return 1;
+
+ case ISD::FMUL:
+ case ISD::FDIV:
+ // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y))
+ if (char V = isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations,
+ ForCodeSize, Depth + 1))
+ return V;
+
+ // Ignore X * 2.0 because that is expected to be canonicalized to X + X.
+ if (auto *C = isConstOrConstSplatFP(Op.getOperand(1)))
+ if (C->isExactlyValue(2.0) && Op.getOpcode() == ISD::FMUL)
+ return 0;
+
+ return isNegatibleForFree(Op.getOperand(1), DAG, LegalOperations,
+ ForCodeSize, Depth + 1);
+
+ case ISD::FMA:
+ case ISD::FMAD: {
+ if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros())
+ return 0;
+
+ // fold (fneg (fma X, Y, Z)) -> (fma (fneg X), Y, (fneg Z))
+ // fold (fneg (fma X, Y, Z)) -> (fma X, (fneg Y), (fneg Z))
+ char V2 = isNegatibleForFree(Op.getOperand(2), DAG, LegalOperations,
+ ForCodeSize, Depth + 1);
+ if (!V2)
+ return 0;
+
+ // One of Op0/Op1 must be cheaply negatible; then select the cheapest.
+ char V0 = isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations,
+ ForCodeSize, Depth + 1);
+ char V1 = isNegatibleForFree(Op.getOperand(1), DAG, LegalOperations,
+ ForCodeSize, Depth + 1);
+ char V01 = std::max(V0, V1);
+ return V01 ? std::max(V01, V2) : 0;
+ }
+
+ case ISD::FP_EXTEND:
+ case ISD::FP_ROUND:
+ case ISD::FSIN:
+ return isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations,
+ ForCodeSize, Depth + 1);
+ }
+
+ return 0;
+}
+
+SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
+ bool LegalOperations,
+ bool ForCodeSize,
+ unsigned Depth) const {
+ // fneg is removable even if it has multiple uses.
+ if (Op.getOpcode() == ISD::FNEG)
+ return Op.getOperand(0);
+
+ assert(Depth <= SelectionDAG::MaxRecursionDepth &&
+ "getNegatedExpression doesn't match isNegatibleForFree");
+ const SDNodeFlags Flags = Op->getFlags();
+
+ switch (Op.getOpcode()) {
+ case ISD::ConstantFP: {
+ APFloat V = cast<ConstantFPSDNode>(Op)->getValueAPF();
+ V.changeSign();
+ return DAG.getConstantFP(V, SDLoc(Op), Op.getValueType());
+ }
+ case ISD::BUILD_VECTOR: {
+ SmallVector<SDValue, 4> Ops;
+ for (SDValue C : Op->op_values()) {
+ if (C.isUndef()) {
+ Ops.push_back(C);
+ continue;
+ }
+ APFloat V = cast<ConstantFPSDNode>(C)->getValueAPF();
+ V.changeSign();
+ Ops.push_back(DAG.getConstantFP(V, SDLoc(Op), C.getValueType()));
+ }
+ return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Ops);
+ }
+ case ISD::FADD:
+ assert((DAG.getTarget().Options.NoSignedZerosFPMath ||
+ Flags.hasNoSignedZeros()) &&
+ "Expected NSZ fp-flag");
+
+ // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
+ if (isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations, ForCodeSize,
+ Depth + 1))
+ return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(),
+ getNegatedExpression(Op.getOperand(0), DAG,
+ LegalOperations, ForCodeSize,
+ Depth + 1),
+ Op.getOperand(1), Flags);
+ // fold (fneg (fadd A, B)) -> (fsub (fneg B), A)
+ return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(),
+ getNegatedExpression(Op.getOperand(1), DAG,
+ LegalOperations, ForCodeSize,
+ Depth + 1),
+ Op.getOperand(0), Flags);
+ case ISD::FSUB:
+ // fold (fneg (fsub 0, B)) -> B
+ if (ConstantFPSDNode *N0CFP =
+ isConstOrConstSplatFP(Op.getOperand(0), /*AllowUndefs*/ true))
+ if (N0CFP->isZero())
+ return Op.getOperand(1);
+
+ // fold (fneg (fsub A, B)) -> (fsub B, A)
+ return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(0), Flags);
+
+ case ISD::FMUL:
+ case ISD::FDIV:
+ // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y)
+ if (isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations, ForCodeSize,
+ Depth + 1))
+ return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
+ getNegatedExpression(Op.getOperand(0), DAG,
+ LegalOperations, ForCodeSize,
+ Depth + 1),
+ Op.getOperand(1), Flags);
+
+ // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y))
+ return DAG.getNode(
+ Op.getOpcode(), SDLoc(Op), Op.getValueType(), Op.getOperand(0),
+ getNegatedExpression(Op.getOperand(1), DAG, LegalOperations,
+ ForCodeSize, Depth + 1),
+ Flags);
+
+ case ISD::FMA:
+ case ISD::FMAD: {
+ assert((DAG.getTarget().Options.NoSignedZerosFPMath ||
+ Flags.hasNoSignedZeros()) &&
+ "Expected NSZ fp-flag");
+
+ SDValue Neg2 = getNegatedExpression(Op.getOperand(2), DAG, LegalOperations,
+ ForCodeSize, Depth + 1);
+
+ char V0 = isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations,
+ ForCodeSize, Depth + 1);
+ char V1 = isNegatibleForFree(Op.getOperand(1), DAG, LegalOperations,
+ ForCodeSize, Depth + 1);
+ if (V0 >= V1) {
+ // fold (fneg (fma X, Y, Z)) -> (fma (fneg X), Y, (fneg Z))
+ SDValue Neg0 = getNegatedExpression(
+ Op.getOperand(0), DAG, LegalOperations, ForCodeSize, Depth + 1);
+ return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), Neg0,
+ Op.getOperand(1), Neg2, Flags);
+ }
+
+ // fold (fneg (fma X, Y, Z)) -> (fma X, (fneg Y), (fneg Z))
+ SDValue Neg1 = getNegatedExpression(Op.getOperand(1), DAG, LegalOperations,
+ ForCodeSize, Depth + 1);
+ return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
+ Op.getOperand(0), Neg1, Neg2, Flags);
+ }
+
+ case ISD::FP_EXTEND:
+ case ISD::FSIN:
+ return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
+ getNegatedExpression(Op.getOperand(0), DAG,
+ LegalOperations, ForCodeSize,
+ Depth + 1));
+ case ISD::FP_ROUND:
+ return DAG.getNode(ISD::FP_ROUND, SDLoc(Op), Op.getValueType(),
+ getNegatedExpression(Op.getOperand(0), DAG,
+ LegalOperations, ForCodeSize,
+ Depth + 1),
+ Op.getOperand(1));
+ }
+
+ llvm_unreachable("Unknown code");
+}
+
//===----------------------------------------------------------------------===//
// Legalization Utilities
//===----------------------------------------------------------------------===//
@@ -4862,7 +5874,8 @@ bool TargetLowering::expandROT(SDNode *Node, SDValue &Result,
bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result,
SelectionDAG &DAG) const {
- SDValue Src = Node->getOperand(0);
+ unsigned OpNo = Node->isStrictFPOpcode() ? 1 : 0;
+ SDValue Src = Node->getOperand(OpNo);
EVT SrcVT = Src.getValueType();
EVT DstVT = Node->getValueType(0);
SDLoc dl(SDValue(Node, 0));
@@ -4871,6 +5884,13 @@ bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result,
if (SrcVT != MVT::f32 || DstVT != MVT::i64)
return false;
+ if (Node->isStrictFPOpcode())
+ // When a NaN is converted to an integer, a trap is allowed. We can't
+ // use this expansion here because it would eliminate that trap. Other
+ // traps are also allowed and cannot be eliminated. See
+ // IEEE 754-2008 sec 5.8.
+ return false;
+
// Expand f32 -> i64 conversion
// This algorithm comes from compiler-rt's implementation of fixsfdi:
// https://github.com/llvm/llvm-project/blob/master/compiler-rt/lib/builtins/fixsfdi.c
@@ -4924,9 +5944,11 @@ bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result,
}
bool TargetLowering::expandFP_TO_UINT(SDNode *Node, SDValue &Result,
+ SDValue &Chain,
SelectionDAG &DAG) const {
SDLoc dl(SDValue(Node, 0));
- SDValue Src = Node->getOperand(0);
+ unsigned OpNo = Node->isStrictFPOpcode() ? 1 : 0;
+ SDValue Src = Node->getOperand(OpNo);
EVT SrcVT = Src.getValueType();
EVT DstVT = Node->getValueType(0);
@@ -4934,7 +5956,9 @@ bool TargetLowering::expandFP_TO_UINT(SDNode *Node, SDValue &Result,
getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT);
// Only expand vector types if we have the appropriate vector bit operations.
- if (DstVT.isVector() && (!isOperationLegalOrCustom(ISD::FP_TO_SINT, DstVT) ||
+ unsigned SIntOpcode = Node->isStrictFPOpcode() ? ISD::STRICT_FP_TO_SINT :
+ ISD::FP_TO_SINT;
+ if (DstVT.isVector() && (!isOperationLegalOrCustom(SIntOpcode, DstVT) ||
!isOperationLegalOrCustomOrPromote(ISD::XOR, SrcVT)))
return false;
@@ -4946,14 +5970,21 @@ bool TargetLowering::expandFP_TO_UINT(SDNode *Node, SDValue &Result,
APInt SignMask = APInt::getSignMask(DstVT.getScalarSizeInBits());
if (APFloat::opOverflow &
APF.convertFromAPInt(SignMask, false, APFloat::rmNearestTiesToEven)) {
- Result = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Src);
+ if (Node->isStrictFPOpcode()) {
+ Result = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { DstVT, MVT::Other },
+ { Node->getOperand(0), Src });
+ Chain = Result.getValue(1);
+ } else
+ Result = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Src);
return true;
}
SDValue Cst = DAG.getConstantFP(APF, dl, SrcVT);
SDValue Sel = DAG.getSetCC(dl, SetCCVT, Src, Cst, ISD::SETLT);
- bool Strict = shouldUseStrictFP_TO_INT(SrcVT, DstVT, /*IsSigned*/ false);
+ bool Strict = Node->isStrictFPOpcode() ||
+ shouldUseStrictFP_TO_INT(SrcVT, DstVT, /*IsSigned*/ false);
+
if (Strict) {
// Expand based on maximum range of FP_TO_SINT, if the value exceeds the
// signmask then offset (the result of which should be fully representable).
@@ -4963,12 +5994,23 @@ bool TargetLowering::expandFP_TO_UINT(SDNode *Node, SDValue &Result,
// Result = fp_to_sint(Val) ^ Ofs
// TODO: Should any fast-math-flags be set for the FSUB?
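// e.g. (illustrative, f32 -> u32): Src = 3e9 u>= 2^31, so we convert
// Val = 3e9 - 2^31 = 852516352 and XOR the result with 0x80000000 to
// get 3000000000.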
- SDValue Val = DAG.getSelect(dl, SrcVT, Sel, Src,
- DAG.getNode(ISD::FSUB, dl, SrcVT, Src, Cst));
+ SDValue SrcBiased;
+ if (Node->isStrictFPOpcode())
+ SrcBiased = DAG.getNode(ISD::STRICT_FSUB, dl, { SrcVT, MVT::Other },
+ { Node->getOperand(0), Src, Cst });
+ else
+ SrcBiased = DAG.getNode(ISD::FSUB, dl, SrcVT, Src, Cst);
+ SDValue Val = DAG.getSelect(dl, SrcVT, Sel, Src, SrcBiased);
SDValue Ofs = DAG.getSelect(dl, DstVT, Sel, DAG.getConstant(0, dl, DstVT),
DAG.getConstant(SignMask, dl, DstVT));
- Result = DAG.getNode(ISD::XOR, dl, DstVT,
- DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Val), Ofs);
+ SDValue SInt;
+ if (Node->isStrictFPOpcode()) {
+ SInt = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { DstVT, MVT::Other },
+ { SrcBiased.getValue(1), Val });
+ Chain = SInt.getValue(1);
+ } else
+ SInt = DAG.getNode(ISD::FP_TO_SINT, dl, DstVT, Val);
+ Result = DAG.getNode(ISD::XOR, dl, DstVT, SInt, Ofs);
} else {
// Expand based on maximum range of FP_TO_SINT:
// True = fp_to_sint(Src)
@@ -5918,7 +6960,8 @@ SDValue
TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const {
assert((Node->getOpcode() == ISD::SMULFIX ||
Node->getOpcode() == ISD::UMULFIX ||
- Node->getOpcode() == ISD::SMULFIXSAT) &&
+ Node->getOpcode() == ISD::SMULFIXSAT ||
+ Node->getOpcode() == ISD::UMULFIXSAT) &&
"Expected a fixed point multiplication opcode");
SDLoc dl(Node);
@@ -5926,15 +6969,19 @@ TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const {
SDValue RHS = Node->getOperand(1);
EVT VT = LHS.getValueType();
unsigned Scale = Node->getConstantOperandVal(2);
- bool Saturating = Node->getOpcode() == ISD::SMULFIXSAT;
+ bool Saturating = (Node->getOpcode() == ISD::SMULFIXSAT ||
+ Node->getOpcode() == ISD::UMULFIXSAT);
+ bool Signed = (Node->getOpcode() == ISD::SMULFIX ||
+ Node->getOpcode() == ISD::SMULFIXSAT);
EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
unsigned VTSize = VT.getScalarSizeInBits();
if (!Scale) {
// [us]mul.fix(a, b, 0) -> mul(a, b)
- if (!Saturating && isOperationLegalOrCustom(ISD::MUL, VT)) {
- return DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
- } else if (Saturating && isOperationLegalOrCustom(ISD::SMULO, VT)) {
+ if (!Saturating) {
+ if (isOperationLegalOrCustom(ISD::MUL, VT))
+ return DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
+ } else if (Signed && isOperationLegalOrCustom(ISD::SMULO, VT)) {
SDValue Result =
DAG.getNode(ISD::SMULO, dl, DAG.getVTList(VT, BoolVT), LHS, RHS);
SDValue Product = Result.getValue(0);
@@ -5948,11 +6995,18 @@ TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const {
SDValue ProdNeg = DAG.getSetCC(dl, BoolVT, Product, Zero, ISD::SETLT);
Result = DAG.getSelect(dl, VT, ProdNeg, SatMax, SatMin);
return DAG.getSelect(dl, VT, Overflow, Result, Product);
+ } else if (!Signed && isOperationLegalOrCustom(ISD::UMULO, VT)) {
+ SDValue Result =
+ DAG.getNode(ISD::UMULO, dl, DAG.getVTList(VT, BoolVT), LHS, RHS);
+ SDValue Product = Result.getValue(0);
+ SDValue Overflow = Result.getValue(1);
+
+ APInt MaxVal = APInt::getMaxValue(VTSize);
+ SDValue SatMax = DAG.getConstant(MaxVal, dl, VT);
+ return DAG.getSelect(dl, VT, Overflow, SatMax, Product);
}
}
- bool Signed =
- Node->getOpcode() == ISD::SMULFIX || Node->getOpcode() == ISD::SMULFIXSAT;
assert(((Signed && Scale < VTSize) || (!Signed && Scale <= VTSize)) &&
"Expected scale to be less than the number of bits if signed or at "
"most the number of bits if unsigned.");
@@ -5978,7 +7032,8 @@ TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const {
if (Scale == VTSize)
// Result is just the top half since we'd be shifting by the width of the
- // operand.
+ // operand. Overflow is impossible, so this works for both UMULFIX and
+ // UMULFIXSAT.
return Hi;
// The result will need to be shifted right by the scale since both operands
@@ -5990,20 +7045,55 @@ TargetLowering::expandFixedPointMul(SDNode *Node, SelectionDAG &DAG) const {
if (!Saturating)
return Result;
- unsigned OverflowBits = VTSize - Scale + 1; // +1 for the sign
- SDValue HiMask =
- DAG.getConstant(APInt::getHighBitsSet(VTSize, OverflowBits), dl, VT);
- SDValue LoMask = DAG.getConstant(
- APInt::getLowBitsSet(VTSize, VTSize - OverflowBits), dl, VT);
- APInt MaxVal = APInt::getSignedMaxValue(VTSize);
- APInt MinVal = APInt::getSignedMinValue(VTSize);
-
- Result = DAG.getSelectCC(dl, Hi, LoMask,
- DAG.getConstant(MaxVal, dl, VT), Result,
- ISD::SETGT);
- return DAG.getSelectCC(dl, Hi, HiMask,
- DAG.getConstant(MinVal, dl, VT), Result,
- ISD::SETLT);
+ if (!Signed) {
+ // Unsigned overflow happened if the upper (VTSize - Scale) bits (of the
+ // widened multiplication) aren't all zeroes.
+
+ // Saturate to max if ((Hi >> Scale) != 0),
+ // which is the same as if (Hi > ((1 << Scale) - 1))
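+ // e.g. (illustrative) for i8 umul.fix.sat with Scale = 4 (Q4.4):
+ // 3.5 * 5.0 is LHS = 56, RHS = 80, Hi:Lo = 0x1180; Hi (17) u> 15, so the
+ // result saturates to 0xFF.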
+ APInt MaxVal = APInt::getMaxValue(VTSize);
+ SDValue LowMask = DAG.getConstant(APInt::getLowBitsSet(VTSize, Scale),
+ dl, VT);
+ Result = DAG.getSelectCC(dl, Hi, LowMask,
+ DAG.getConstant(MaxVal, dl, VT), Result,
+ ISD::SETUGT);
+
+ return Result;
+ }
+
+ // Signed overflow happened if the upper (VTSize - Scale + 1) bits (of the
+ // widened multiplication) aren't all ones or all zeroes.
+
+ SDValue SatMin = DAG.getConstant(APInt::getSignedMinValue(VTSize), dl, VT);
+ SDValue SatMax = DAG.getConstant(APInt::getSignedMaxValue(VTSize), dl, VT);
+
+ if (Scale == 0) {
+ SDValue Sign = DAG.getNode(ISD::SRA, dl, VT, Lo,
+ DAG.getConstant(VTSize - 1, dl, ShiftTy));
+ SDValue Overflow = DAG.getSetCC(dl, BoolVT, Hi, Sign, ISD::SETNE);
+ // Saturate to SatMin if the wide product is negative, and to SatMax if
+ // the wide product is positive ...
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+ SDValue ResultIfOverflow = DAG.getSelectCC(dl, Hi, Zero, SatMin, SatMax,
+ ISD::SETLT);
+ // ... but only if we overflowed.
+ return DAG.getSelect(dl, VT, Overflow, ResultIfOverflow, Result);
+ }
+
+ // We handled Scale == 0 above, so all the bits to examine are in Hi.
+
+ // Saturate to max if ((Hi >> (Scale - 1)) > 0),
+ // which is the same as if (Hi > (1 << (Scale - 1)) - 1)
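+ // e.g. (illustrative) for i8 smul.fix.sat with Scale = 4 (Q3.4):
+ // 5.0 * 2.0 is Hi:Lo = 0x0A00; Hi (10) > (1 << 3) - 1 = 7, so the result
+ // saturates to SatMax.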
+ SDValue LowMask = DAG.getConstant(APInt::getLowBitsSet(VTSize, Scale - 1),
+ dl, VT);
+ Result = DAG.getSelectCC(dl, Hi, LowMask, SatMax, Result, ISD::SETGT);
+ // Saturate to min if ((Hi >> (Scale - 1)) < -1),
+ // which is the same as if (Hi < (-1 << (Scale - 1)))
+ SDValue HighMask =
+ DAG.getConstant(APInt::getHighBitsSet(VTSize, VTSize - Scale + 1),
+ dl, VT);
+ Result = DAG.getSelectCC(dl, Hi, HighMask, SatMin, Result, ISD::SETLT);
+ return Result;
}
void TargetLowering::expandUADDSUBO(
@@ -6060,24 +7150,19 @@ void TargetLowering::expandSADDSUBO(
SDValue Zero = DAG.getConstant(0, dl, LHS.getValueType());
- // LHSSign -> LHS >= 0
- // RHSSign -> RHS >= 0
- // SumSign -> Result >= 0
- //
- // Add:
- // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
- // Sub:
- // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
- SDValue LHSSign = DAG.getSetCC(dl, OType, LHS, Zero, ISD::SETGE);
- SDValue RHSSign = DAG.getSetCC(dl, OType, RHS, Zero, ISD::SETGE);
- SDValue SignsMatch = DAG.getSetCC(dl, OType, LHSSign, RHSSign,
- IsAdd ? ISD::SETEQ : ISD::SETNE);
-
- SDValue SumSign = DAG.getSetCC(dl, OType, Result, Zero, ISD::SETGE);
- SDValue SumSignNE = DAG.getSetCC(dl, OType, LHSSign, SumSign, ISD::SETNE);
-
- SDValue Cmp = DAG.getNode(ISD::AND, dl, OType, SignsMatch, SumSignNE);
- Overflow = DAG.getBoolExtOrTrunc(Cmp, dl, ResultType, ResultType);
+ // For an addition, the result should be less than the LHS operand if and
+ // only if the RHS operand is negative; otherwise there was overflow.
+ // For a subtraction, the result should be less than the LHS operand if
+ // and only if the RHS operand is (non-zero) positive; otherwise there was
+ // overflow.
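+ // e.g. (illustrative, i8 saddo): 100 + 100 wraps to -56; Result < LHS is
+ // true while RHS < 0 is false, so the XOR below signals overflow.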
+ SDValue ResultLowerThanLHS = DAG.getSetCC(dl, OType, Result, LHS, ISD::SETLT);
+ SDValue ConditionRHS =
+ DAG.getSetCC(dl, OType, RHS, Zero, IsAdd ? ISD::SETLT : ISD::SETGT);
+
+ Overflow = DAG.getBoolExtOrTrunc(
+ DAG.getNode(ISD::XOR, dl, OType, ConditionRHS, ResultLowerThanLHS), dl,
+ ResultType, ResultType);
}
bool TargetLowering::expandMULO(SDNode *Node, SDValue &Result,
@@ -6176,20 +7261,19 @@ bool TargetLowering::expandMULO(SDNode *Node, SDValue &Result,
// being a legal type for the architecture and thus has to be split to
// two arguments.
SDValue Ret;
+ TargetLowering::MakeLibCallOptions CallOptions;
+ CallOptions.setSExt(isSigned);
+ CallOptions.setIsPostTypeLegalization(true);
if (shouldSplitFunctionArgumentsAsLittleEndian(DAG.getDataLayout())) {
// Halves of WideVT are packed into registers in different order
// depending on platform endianness. This is usually handled by
// the C calling convention, but we can't defer to it in
// the legalizer.
SDValue Args[] = { LHS, HiLHS, RHS, HiRHS };
- Ret = makeLibCall(DAG, LC, WideVT, Args, isSigned, dl,
- /* doesNotReturn */ false, /* isReturnValueUsed */ true,
- /* isPostTypeLegalization */ true).first;
+ Ret = makeLibCall(DAG, LC, WideVT, Args, CallOptions, dl).first;
} else {
SDValue Args[] = { HiLHS, LHS, HiRHS, RHS };
- Ret = makeLibCall(DAG, LC, WideVT, Args, isSigned, dl,
- /* doesNotReturn */ false, /* isReturnValueUsed */ true,
- /* isPostTypeLegalization */ true).first;
+ Ret = makeLibCall(DAG, LC, WideVT, Args, CallOptions, dl).first;
}
assert(Ret.getOpcode() == ISD::MERGE_VALUES &&
"Ret value is a collection of constituent nodes holding result.");
diff --git a/lib/CodeGen/ShrinkWrap.cpp b/lib/CodeGen/ShrinkWrap.cpp
index 2db0ea570598..412a00095b9b 100644
--- a/lib/CodeGen/ShrinkWrap.cpp
+++ b/lib/CodeGen/ShrinkWrap.cpp
@@ -278,11 +278,10 @@ bool ShrinkWrap::useOrDefCSROrFI(const MachineInstr &MI,
// Ignore instructions like DBG_VALUE which don't read/def the register.
if (!MO.isDef() && !MO.readsReg())
continue;
- unsigned PhysReg = MO.getReg();
+ Register PhysReg = MO.getReg();
if (!PhysReg)
continue;
- assert(TargetRegisterInfo::isPhysicalRegister(PhysReg) &&
- "Unallocated register?!");
+ assert(Register::isPhysicalRegister(PhysReg) && "Unallocated register?!");
// The stack pointer is not normally described as a callee-saved register
// in calling convention definitions, so we need to watch for it
// separately. An SP mentioned by a call instruction, we can ignore,
diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp
index 23e5ce0acae8..db520d4e6403 100644
--- a/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/lib/CodeGen/SjLjEHPrepare.cpp
@@ -477,7 +477,10 @@ bool SjLjEHPrepare::runOnFunction(Function &F) {
UnregisterFn = M.getOrInsertFunction(
"_Unwind_SjLj_Unregister", Type::getVoidTy(M.getContext()),
PointerType::getUnqual(FunctionContextTy));
- FrameAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::frameaddress);
+ FrameAddrFn = Intrinsic::getDeclaration(
+ &M, Intrinsic::frameaddress,
+ {Type::getInt8PtrTy(M.getContext(),
+ M.getDataLayout().getAllocaAddrSpace())});
StackAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave);
StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore);
BuiltinSetupDispatchFn =
diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp
index 5c944fe3f6b3..0c1f1220c421 100644
--- a/lib/CodeGen/SplitKit.cpp
+++ b/lib/CodeGen/SplitKit.cpp
@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "SplitKit.h"
-#include "LiveRangeCalc.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/None.h"
@@ -22,6 +21,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRangeCalc.h"
#include "llvm/CodeGen/LiveRangeEdit.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
@@ -437,7 +437,7 @@ void SplitEditor::addDeadDef(LiveInterval &LI, VNInfo *VNI, bool Original) {
assert(DefMI != nullptr);
LaneBitmask LM;
for (const MachineOperand &DefOp : DefMI->defs()) {
- unsigned R = DefOp.getReg();
+ Register R = DefOp.getReg();
if (R != LI.reg)
continue;
if (unsigned SR = DefOp.getSubReg())
@@ -1373,7 +1373,7 @@ void SplitEditor::rewriteAssigned(bool ExtendRanges) {
assert(LI.hasSubRanges());
LiveRangeCalc SubLRC;
- unsigned Reg = EP.MO.getReg(), Sub = EP.MO.getSubReg();
+ Register Reg = EP.MO.getReg(), Sub = EP.MO.getSubReg();
LaneBitmask LM = Sub != 0 ? TRI.getSubRegIndexLaneMask(Sub)
: MRI.getMaxLaneMaskForVReg(Reg);
for (LiveInterval::SubRange &S : LI.subranges()) {
diff --git a/lib/CodeGen/SplitKit.h b/lib/CodeGen/SplitKit.h
index 86ad3811e3ad..78f0bbd24db5 100644
--- a/lib/CodeGen/SplitKit.h
+++ b/lib/CodeGen/SplitKit.h
@@ -14,7 +14,6 @@
#ifndef LLVM_LIB_CODEGEN_SPLITKIT_H
#define LLVM_LIB_CODEGEN_SPLITKIT_H
-#include "LiveRangeCalc.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
@@ -25,6 +24,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRangeCalc.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SlotIndexes.h"
diff --git a/lib/CodeGen/StackMaps.cpp b/lib/CodeGen/StackMaps.cpp
index ae9401b89700..383c91259ffc 100644
--- a/lib/CodeGen/StackMaps.cpp
+++ b/lib/CodeGen/StackMaps.cpp
@@ -113,7 +113,7 @@ StackMaps::parseOperand(MachineInstr::const_mop_iterator MOI,
unsigned Size = DL.getPointerSizeInBits();
assert((Size % 8) == 0 && "Need pointer size in bytes.");
Size /= 8;
- unsigned Reg = (++MOI)->getReg();
+ Register Reg = (++MOI)->getReg();
int64_t Imm = (++MOI)->getImm();
Locs.emplace_back(StackMaps::Location::Direct, Size,
getDwarfRegNum(Reg, TRI), Imm);
@@ -122,7 +122,7 @@ StackMaps::parseOperand(MachineInstr::const_mop_iterator MOI,
case StackMaps::IndirectMemRefOp: {
int64_t Size = (++MOI)->getImm();
assert(Size > 0 && "Need a valid size for indirect memory locations.");
- unsigned Reg = (++MOI)->getReg();
+ Register Reg = (++MOI)->getReg();
int64_t Imm = (++MOI)->getImm();
Locs.emplace_back(StackMaps::Location::Indirect, Size,
getDwarfRegNum(Reg, TRI), Imm);
@@ -148,14 +148,14 @@ StackMaps::parseOperand(MachineInstr::const_mop_iterator MOI,
if (MOI->isImplicit())
return ++MOI;
- assert(TargetRegisterInfo::isPhysicalRegister(MOI->getReg()) &&
+ assert(Register::isPhysicalRegister(MOI->getReg()) &&
"Virtreg operands should have been rewritten before now.");
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(MOI->getReg());
assert(!MOI->getSubReg() && "Physical subreg still around.");
unsigned Offset = 0;
unsigned DwarfRegNum = getDwarfRegNum(MOI->getReg(), TRI);
- unsigned LLVMRegNum = TRI->getLLVMRegNum(DwarfRegNum, false);
+ unsigned LLVMRegNum = *TRI->getLLVMRegNum(DwarfRegNum, false);
unsigned SubRegIdx = TRI->getSubRegIndex(LLVMRegNum, MOI->getReg());
if (SubRegIdx)
Offset = TRI->getSubRegIdxOffset(SubRegIdx);
diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp
index 809960c7fdf9..5683d1db473c 100644
--- a/lib/CodeGen/StackProtector.cpp
+++ b/lib/CodeGen/StackProtector.cpp
@@ -17,7 +17,6 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/CaptureTracking.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/CodeGen/Passes.h"
@@ -157,6 +156,68 @@ bool StackProtector::ContainsProtectableArray(Type *Ty, bool &IsLarge,
return NeedsProtector;
}
+bool StackProtector::HasAddressTaken(const Instruction *AI) {
+ for (const User *U : AI->users()) {
+ const auto *I = cast<Instruction>(U);
+ switch (I->getOpcode()) {
+ case Instruction::Store:
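+ // Storing the pointer itself, rather than storing to it, takes the
+ // address.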
+ if (AI == cast<StoreInst>(I)->getValueOperand())
+ return true;
+ break;
+ case Instruction::AtomicCmpXchg:
+ // cmpxchg conceptually includes both a load and store from the same
+ // location. So, like store, the value being stored is what matters.
+ if (AI == cast<AtomicCmpXchgInst>(I)->getNewValOperand())
+ return true;
+ break;
+ case Instruction::PtrToInt:
+ if (AI == cast<PtrToIntInst>(I)->getOperand(0))
+ return true;
+ break;
+ case Instruction::Call: {
+ // Ignore intrinsics that do not become real instructions.
+ // TODO: Narrow this to intrinsics that have store-like effects.
+ const auto *CI = cast<CallInst>(I);
+ if (!isa<DbgInfoIntrinsic>(CI) && !CI->isLifetimeStartOrEnd())
+ return true;
+ break;
+ }
+ case Instruction::Invoke:
+ return true;
+ case Instruction::BitCast:
+ case Instruction::GetElementPtr:
+ case Instruction::Select:
+ case Instruction::AddrSpaceCast:
+ if (HasAddressTaken(I))
+ return true;
+ break;
+ case Instruction::PHI: {
+ // Keep track of what PHI nodes we have already visited to ensure
+ // they are only visited once.
+ const auto *PN = cast<PHINode>(I);
+ if (VisitedPHIs.insert(PN).second)
+ if (HasAddressTaken(PN))
+ return true;
+ break;
+ }
+ case Instruction::Load:
+ case Instruction::AtomicRMW:
+ case Instruction::Ret:
+ // These instructions take an address operand, but have load-like or
+ // other innocuous behavior that should not trigger a stack protector.
+ // atomicrmw conceptually has both load and store semantics, but the
+ // value being stored must be an integer; so if a pointer is being stored,
+ // we'll catch it in the PtrToInt case above.
+ break;
+ default:
+ // Conservatively return true for any instruction that takes an address
+ // operand, but is not handled above.
+ return true;
+ }
+ }
+ return false;
+}
+
/// Search for the first call to the llvm.stackprotector intrinsic and return it
/// if present.
static const CallInst *findStackProtectorIntrinsic(Function &F) {
@@ -264,9 +325,7 @@ bool StackProtector::RequiresStackProtector() {
continue;
}
- if (Strong && PointerMayBeCaptured(AI,
- /* ReturnCaptures */ false,
- /* StoreCaptures */ true)) {
+ if (Strong && HasAddressTaken(AI)) {
++NumAddrTaken;
Layout.insert(std::make_pair(AI, MachineFrameInfo::SSPLK_AddrOf));
ORE.emit([&]() {
diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp
index 99b533e10b87..9c8143c55dc2 100644
--- a/lib/CodeGen/StackSlotColoring.cpp
+++ b/lib/CodeGen/StackSlotColoring.cpp
@@ -221,7 +221,7 @@ void StackSlotColoring::InitializeSlots() {
for (auto *I : Intervals) {
LiveInterval &li = I->second;
LLVM_DEBUG(li.dump());
- int FI = TargetRegisterInfo::stackSlot2Index(li.reg);
+ int FI = Register::stackSlot2Index(li.reg);
if (MFI->isDeadObjectIndex(FI))
continue;
@@ -268,7 +268,7 @@ StackSlotColoring::OverlapWithAssignments(LiveInterval *li, int Color) const {
int StackSlotColoring::ColorSlot(LiveInterval *li) {
int Color = -1;
bool Share = false;
- int FI = TargetRegisterInfo::stackSlot2Index(li->reg);
+ int FI = Register::stackSlot2Index(li->reg);
uint8_t StackID = MFI->getStackID(FI);
if (!DisableSharing) {
@@ -330,7 +330,7 @@ bool StackSlotColoring::ColorSlots(MachineFunction &MF) {
bool Changed = false;
for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i) {
LiveInterval *li = SSIntervals[i];
- int SS = TargetRegisterInfo::stackSlot2Index(li->reg);
+ int SS = Register::stackSlot2Index(li->reg);
int NewSS = ColorSlot(li);
assert(NewSS >= 0 && "Stack coloring failed?");
SlotMapping[SS] = NewSS;
@@ -343,7 +343,7 @@ bool StackSlotColoring::ColorSlots(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "\nSpill slots after coloring:\n");
for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i) {
LiveInterval *li = SSIntervals[i];
- int SS = TargetRegisterInfo::stackSlot2Index(li->reg);
+ int SS = Register::stackSlot2Index(li->reg);
li->weight = SlotWeights[SS];
}
// Sort them by new weight.
diff --git a/lib/CodeGen/SwiftErrorValueTracking.cpp b/lib/CodeGen/SwiftErrorValueTracking.cpp
index 96821cadb1b6..c72a04276a4f 100644
--- a/lib/CodeGen/SwiftErrorValueTracking.cpp
+++ b/lib/CodeGen/SwiftErrorValueTracking.cpp
@@ -13,9 +13,10 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/SwiftErrorValueTracking.h"
+#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallSet.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Value.h"
diff --git a/lib/CodeGen/TailDuplicator.cpp b/lib/CodeGen/TailDuplicator.cpp
index a0590a8a6cc6..03c68a37e459 100644
--- a/lib/CodeGen/TailDuplicator.cpp
+++ b/lib/CodeGen/TailDuplicator.cpp
@@ -235,8 +235,8 @@ bool TailDuplicator::tailDuplicateAndUpdate(
MachineInstr *Copy = Copies[i];
if (!Copy->isCopy())
continue;
- unsigned Dst = Copy->getOperand(0).getReg();
- unsigned Src = Copy->getOperand(1).getReg();
+ Register Dst = Copy->getOperand(0).getReg();
+ Register Src = Copy->getOperand(1).getReg();
if (MRI->hasOneNonDBGUse(Src) &&
MRI->constrainRegClass(Src, MRI->getRegClass(Dst))) {
// Copy is the only use. Do trivial copy propagation here.
@@ -312,7 +312,7 @@ static void getRegsUsedByPHIs(const MachineBasicBlock &BB,
if (!MI.isPHI())
break;
for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
- unsigned SrcReg = MI.getOperand(i).getReg();
+ Register SrcReg = MI.getOperand(i).getReg();
UsedByPhi->insert(SrcReg);
}
}
@@ -340,17 +340,17 @@ void TailDuplicator::processPHI(
DenseMap<unsigned, RegSubRegPair> &LocalVRMap,
SmallVectorImpl<std::pair<unsigned, RegSubRegPair>> &Copies,
const DenseSet<unsigned> &RegsUsedByPhi, bool Remove) {
- unsigned DefReg = MI->getOperand(0).getReg();
+ Register DefReg = MI->getOperand(0).getReg();
unsigned SrcOpIdx = getPHISrcRegOpIdx(MI, PredBB);
assert(SrcOpIdx && "Unable to find matching PHI source?");
- unsigned SrcReg = MI->getOperand(SrcOpIdx).getReg();
+ Register SrcReg = MI->getOperand(SrcOpIdx).getReg();
unsigned SrcSubReg = MI->getOperand(SrcOpIdx).getSubReg();
const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
LocalVRMap.insert(std::make_pair(DefReg, RegSubRegPair(SrcReg, SrcSubReg)));
// Insert a copy from source to the end of the block. The def register is the
// available value liveout of the block.
- unsigned NewDef = MRI->createVirtualRegister(RC);
+ Register NewDef = MRI->createVirtualRegister(RC);
Copies.push_back(std::make_pair(NewDef, RegSubRegPair(SrcReg, SrcSubReg)));
if (isDefLiveOut(DefReg, TailBB, MRI) || RegsUsedByPhi.count(DefReg))
addSSAUpdateEntry(DefReg, NewDef, PredBB);
@@ -384,12 +384,12 @@ void TailDuplicator::duplicateInstruction(
MachineOperand &MO = NewMI.getOperand(i);
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ Register Reg = MO.getReg();
+ if (!Register::isVirtualRegister(Reg))
continue;
if (MO.isDef()) {
const TargetRegisterClass *RC = MRI->getRegClass(Reg);
- unsigned NewReg = MRI->createVirtualRegister(RC);
+ Register NewReg = MRI->createVirtualRegister(RC);
MO.setReg(NewReg);
LocalVRMap.insert(std::make_pair(Reg, RegSubRegPair(NewReg, 0)));
if (isDefLiveOut(Reg, TailBB, MRI) || UsedByPhi.count(Reg))
@@ -433,7 +433,7 @@ void TailDuplicator::duplicateInstruction(
auto *NewRC = MI->getRegClassConstraint(i, TII, TRI);
if (NewRC == nullptr)
NewRC = OrigRC;
- unsigned NewReg = MRI->createVirtualRegister(NewRC);
+ Register NewReg = MRI->createVirtualRegister(NewRC);
BuildMI(*PredBB, NewMI, NewMI.getDebugLoc(),
TII->get(TargetOpcode::COPY), NewReg)
.addReg(VI->second.Reg, 0, VI->second.SubReg);
@@ -477,7 +477,7 @@ void TailDuplicator::updateSuccessorsPHIs(
assert(Idx != 0);
MachineOperand &MO0 = MI.getOperand(Idx);
- unsigned Reg = MO0.getReg();
+ Register Reg = MO0.getReg();
if (isDead) {
// Folded into the previous BB.
// There could be duplicate phi source entries. FIXME: Should sdisel
diff --git a/lib/CodeGen/TargetFrameLoweringImpl.cpp b/lib/CodeGen/TargetFrameLoweringImpl.cpp
index 9c4483cb240d..9eeacc2584cb 100644
--- a/lib/CodeGen/TargetFrameLoweringImpl.cpp
+++ b/lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -18,6 +18,7 @@
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -71,7 +72,9 @@ void TargetFrameLowering::determineCalleeSaves(MachineFunction &MF,
// When interprocedural register allocation is enabled caller saved registers
// are preferred over callee saved registers.
- if (MF.getTarget().Options.EnableIPRA && isSafeForNoCSROpt(MF.getFunction()))
+ if (MF.getTarget().Options.EnableIPRA &&
+ isSafeForNoCSROpt(MF.getFunction()) &&
+ isProfitableForNoCSROpt(MF.getFunction()))
return;
// Get the callee saved register list...
@@ -118,6 +121,18 @@ unsigned TargetFrameLowering::getStackAlignmentSkew(
return 0;
}
+bool TargetFrameLowering::isSafeForNoCSROpt(const Function &F) {
+ if (!F.hasLocalLinkage() || F.hasAddressTaken() ||
+ !F.hasFnAttribute(Attribute::NoRecurse))
+ return false;
+ // The function should not be optimized as a tail call.
+ for (const User *U : F.users())
+ if (auto CS = ImmutableCallSite(U))
+ if (CS.isTailCall())
+ return false;
+ return true;
+}
+
int TargetFrameLowering::getInitialCFAOffset(const MachineFunction &MF) const {
llvm_unreachable("getInitialCFAOffset() not implemented!");
}
@@ -125,4 +140,4 @@ int TargetFrameLowering::getInitialCFAOffset(const MachineFunction &MF) const {
unsigned TargetFrameLowering::getInitialCFARegister(const MachineFunction &MF)
const {
llvm_unreachable("getInitialCFARegister() not implemented!");
-} \ No newline at end of file
+}
diff --git a/lib/CodeGen/TargetInstrInfo.cpp b/lib/CodeGen/TargetInstrInfo.cpp
index 868617ffe14d..6cae3b869501 100644
--- a/lib/CodeGen/TargetInstrInfo.cpp
+++ b/lib/CodeGen/TargetInstrInfo.cpp
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Support/CommandLine.h"
@@ -142,7 +143,7 @@ TargetInstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
while (Tail != MBB->end()) {
auto MI = Tail++;
if (MI->isCall())
- MBB->getParent()->updateCallSiteInfo(&*MI);
+ MBB->getParent()->eraseCallSiteInfo(&*MI);
MBB->erase(MI);
}
@@ -183,10 +184,10 @@ MachineInstr *TargetInstrInfo::commuteInstructionImpl(MachineInstr &MI,
bool Reg2IsInternal = MI.getOperand(Idx2).isInternalRead();
// Avoid calling isRenamable for virtual registers since we assert that
// renamable property is only queried/set for physical registers.
- bool Reg1IsRenamable = TargetRegisterInfo::isPhysicalRegister(Reg1)
+ bool Reg1IsRenamable = Register::isPhysicalRegister(Reg1)
? MI.getOperand(Idx1).isRenamable()
: false;
- bool Reg2IsRenamable = TargetRegisterInfo::isPhysicalRegister(Reg2)
+ bool Reg2IsRenamable = Register::isPhysicalRegister(Reg2)
? MI.getOperand(Idx2).isRenamable()
: false;
// If destination is tied to either of the commuted source register, then
@@ -228,9 +229,9 @@ MachineInstr *TargetInstrInfo::commuteInstructionImpl(MachineInstr &MI,
CommutedMI->getOperand(Idx1).setIsInternalRead(Reg2IsInternal);
// Avoid calling setIsRenamable for virtual registers since we assert that
// renamable property is only queried/set for physical registers.
- if (TargetRegisterInfo::isPhysicalRegister(Reg1))
+ if (Register::isPhysicalRegister(Reg1))
CommutedMI->getOperand(Idx2).setIsRenamable(Reg1IsRenamable);
- if (TargetRegisterInfo::isPhysicalRegister(Reg2))
+ if (Register::isPhysicalRegister(Reg2))
CommutedMI->getOperand(Idx1).setIsRenamable(Reg2IsRenamable);
return CommutedMI;
}
@@ -281,7 +282,7 @@ bool TargetInstrInfo::fixCommutedOpIndices(unsigned &ResultIdx1,
return true;
}
-bool TargetInstrInfo::findCommutedOpIndices(MachineInstr &MI,
+bool TargetInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const {
assert(!MI.isBundle() &&
@@ -393,7 +394,7 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC,
if (BitOffset < 0 || BitOffset % 8)
return false;
- Size = BitSize /= 8;
+ Size = BitSize / 8;
Offset = (unsigned)BitOffset / 8;
assert(TRI->getSpillSize(*RC) >= (Offset + Size) && "bad subregister range");
@@ -442,16 +443,15 @@ static const TargetRegisterClass *canFoldCopy(const MachineInstr &MI,
if (FoldOp.getSubReg() || LiveOp.getSubReg())
return nullptr;
- unsigned FoldReg = FoldOp.getReg();
- unsigned LiveReg = LiveOp.getReg();
+ Register FoldReg = FoldOp.getReg();
+ Register LiveReg = LiveOp.getReg();
- assert(TargetRegisterInfo::isVirtualRegister(FoldReg) &&
- "Cannot fold physregs");
+ assert(Register::isVirtualRegister(FoldReg) && "Cannot fold physregs");
const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
const TargetRegisterClass *RC = MRI.getRegClass(FoldReg);
- if (TargetRegisterInfo::isPhysicalRegister(LiveOp.getReg()))
+ if (Register::isPhysicalRegister(LiveOp.getReg()))
return RC->contains(LiveOp.getReg()) ? RC : nullptr;
if (RC->hasSubClassEq(MRI.getRegClass(LiveReg)))
@@ -674,9 +674,9 @@ bool TargetInstrInfo::hasReassociableOperands(
// reassociate.
MachineInstr *MI1 = nullptr;
MachineInstr *MI2 = nullptr;
- if (Op1.isReg() && TargetRegisterInfo::isVirtualRegister(Op1.getReg()))
+ if (Op1.isReg() && Register::isVirtualRegister(Op1.getReg()))
MI1 = MRI.getUniqueVRegDef(Op1.getReg());
- if (Op2.isReg() && TargetRegisterInfo::isVirtualRegister(Op2.getReg()))
+ if (Op2.isReg() && Register::isVirtualRegister(Op2.getReg()))
MI2 = MRI.getUniqueVRegDef(Op2.getReg());
// And they need to be in the trace (otherwise, they won't have a depth).
@@ -805,27 +805,27 @@ void TargetInstrInfo::reassociateOps(
MachineOperand &OpY = Root.getOperand(OpIdx[Row][3]);
MachineOperand &OpC = Root.getOperand(0);
- unsigned RegA = OpA.getReg();
- unsigned RegB = OpB.getReg();
- unsigned RegX = OpX.getReg();
- unsigned RegY = OpY.getReg();
- unsigned RegC = OpC.getReg();
+ Register RegA = OpA.getReg();
+ Register RegB = OpB.getReg();
+ Register RegX = OpX.getReg();
+ Register RegY = OpY.getReg();
+ Register RegC = OpC.getReg();
- if (TargetRegisterInfo::isVirtualRegister(RegA))
+ if (Register::isVirtualRegister(RegA))
MRI.constrainRegClass(RegA, RC);
- if (TargetRegisterInfo::isVirtualRegister(RegB))
+ if (Register::isVirtualRegister(RegB))
MRI.constrainRegClass(RegB, RC);
- if (TargetRegisterInfo::isVirtualRegister(RegX))
+ if (Register::isVirtualRegister(RegX))
MRI.constrainRegClass(RegX, RC);
- if (TargetRegisterInfo::isVirtualRegister(RegY))
+ if (Register::isVirtualRegister(RegY))
MRI.constrainRegClass(RegY, RC);
- if (TargetRegisterInfo::isVirtualRegister(RegC))
+ if (Register::isVirtualRegister(RegC))
MRI.constrainRegClass(RegC, RC);
// Create a new virtual register for the result of (X op Y) instead of
// recycling RegB because the MachineCombiner's computation of the critical
// path requires a new register definition rather than an existing one.
- unsigned NewVR = MRI.createVirtualRegister(RC);
+ Register NewVR = MRI.createVirtualRegister(RC);
InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
unsigned Opcode = Root.getOpcode();
@@ -880,21 +880,21 @@ void TargetInstrInfo::genAlternativeCodeSequence(
}
bool TargetInstrInfo::isReallyTriviallyReMaterializableGeneric(
- const MachineInstr &MI, AliasAnalysis *AA) const {
+ const MachineInstr &MI, AAResults *AA) const {
const MachineFunction &MF = *MI.getMF();
const MachineRegisterInfo &MRI = MF.getRegInfo();
// Remat clients assume operand 0 is the defined register.
if (!MI.getNumOperands() || !MI.getOperand(0).isReg())
return false;
- unsigned DefReg = MI.getOperand(0).getReg();
+ Register DefReg = MI.getOperand(0).getReg();
// A sub-register definition can only be rematerialized if the instruction
// doesn't read the other parts of the register. Otherwise it is really a
// read-modify-write operation on the full virtual register which cannot be
// moved safely.
- if (TargetRegisterInfo::isVirtualRegister(DefReg) &&
- MI.getOperand(0).getSubReg() && MI.readsVirtualRegister(DefReg))
+ if (Register::isVirtualRegister(DefReg) && MI.getOperand(0).getSubReg() &&
+ MI.readsVirtualRegister(DefReg))
return false;
// A load from a fixed stack slot can be rematerialized. This may be
@@ -924,12 +924,12 @@ bool TargetInstrInfo::isReallyTriviallyReMaterializableGeneric(
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI.getOperand(i);
if (!MO.isReg()) continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == 0)
continue;
// Check for a well-behaved physical register.
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (Register::isPhysicalRegister(Reg)) {
if (MO.isUse()) {
// If the physreg has no defs anywhere, it's just an ambient register
// and we can freely move its uses. Alternatively, if it's allocatable,
@@ -1120,6 +1120,24 @@ bool TargetInstrInfo::hasLowDefLatency(const TargetSchedModel &SchedModel,
return (DefCycle != -1 && DefCycle <= 1);
}
+Optional<ParamLoadedValue>
+TargetInstrInfo::describeLoadedValue(const MachineInstr &MI) const {
+ const MachineFunction *MF = MI.getMF();
+ const MachineOperand *Op = nullptr;
+ DIExpression *Expr = DIExpression::get(MF->getFunction().getContext(), {});
+ const MachineOperand *SrcRegOp, *DestRegOp;
+
+ if (isCopyInstr(MI, SrcRegOp, DestRegOp)) {
+ Op = SrcRegOp;
+ return ParamLoadedValue(*Op, Expr);
+ } else if (MI.isMoveImmediate()) {
+ Op = &MI.getOperand(1);
+ return ParamLoadedValue(*Op, Expr);
+ }
+
+ return None;
+}
+
/// Both DefMI and UseMI must be valid. By default, call directly to the
/// itinerary. This may be overriden by the target.
int TargetInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
@@ -1227,3 +1245,5 @@ bool TargetInstrInfo::getInsertSubregInputs(
InsertedReg.SubIdx = (unsigned)MOSubIdx.getImm();
return true;
}
+
+TargetInstrInfo::PipelinerLoopInfo::~PipelinerLoopInfo() {}
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index 9b28c1a6c450..9b23012f47e3 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -167,6 +167,7 @@ void TargetLoweringBase::InitLibcalls(const Triple &TT) {
setLibcallName(RTLIB::BZERO, "__bzero");
break;
case Triple::aarch64:
+ case Triple::aarch64_32:
setLibcallName(RTLIB::BZERO, "bzero");
break;
default:
@@ -197,6 +198,11 @@ void TargetLoweringBase::InitLibcalls(const Triple &TT) {
setLibcallName(RTLIB::SINCOS_PPCF128, "sincosl");
}
+ if (TT.isPS4CPU()) {
+ setLibcallName(RTLIB::SINCOS_F32, "sincosf");
+ setLibcallName(RTLIB::SINCOS_F64, "sincos");
+ }
+
if (TT.isOSOpenBSD()) {
setLibcallName(RTLIB::STACKPROTECTOR_CHECK_FAIL, nullptr);
}
@@ -578,13 +584,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
BooleanFloatContents = UndefinedBooleanContent;
BooleanVectorContents = UndefinedBooleanContent;
SchedPreferenceInfo = Sched::ILP;
- JumpBufSize = 0;
- JumpBufAlignment = 0;
- MinFunctionAlignment = 0;
- PrefFunctionAlignment = 0;
- PrefLoopAlignment = 0;
GatherAllAliasesMaxDepth = 18;
- MinStackArgumentAlignment = 1;
// TODO: the default will be switched to 0 in the next commit, along
// with the Target-specific changes necessary.
MaxAtomicSizeInBitsSupported = 1024;
@@ -653,6 +653,7 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::SMULFIX, VT, Expand);
setOperationAction(ISD::SMULFIXSAT, VT, Expand);
setOperationAction(ISD::UMULFIX, VT, Expand);
+ setOperationAction(ISD::UMULFIXSAT, VT, Expand);
// Overflow operations default to expand
setOperationAction(ISD::SADDO, VT, Expand);
@@ -689,6 +690,7 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, VT, Expand);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Expand);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Expand);
+ setOperationAction(ISD::SPLAT_VECTOR, VT, Expand);
}
// Constrained floating-point operations default to expand.
@@ -708,16 +710,22 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::STRICT_FLOG, VT, Expand);
setOperationAction(ISD::STRICT_FLOG10, VT, Expand);
setOperationAction(ISD::STRICT_FLOG2, VT, Expand);
+ setOperationAction(ISD::STRICT_LRINT, VT, Expand);
+ setOperationAction(ISD::STRICT_LLRINT, VT, Expand);
setOperationAction(ISD::STRICT_FRINT, VT, Expand);
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Expand);
setOperationAction(ISD::STRICT_FCEIL, VT, Expand);
setOperationAction(ISD::STRICT_FFLOOR, VT, Expand);
+ setOperationAction(ISD::STRICT_LROUND, VT, Expand);
+ setOperationAction(ISD::STRICT_LLROUND, VT, Expand);
setOperationAction(ISD::STRICT_FROUND, VT, Expand);
setOperationAction(ISD::STRICT_FTRUNC, VT, Expand);
setOperationAction(ISD::STRICT_FMAXNUM, VT, Expand);
setOperationAction(ISD::STRICT_FMINNUM, VT, Expand);
setOperationAction(ISD::STRICT_FP_ROUND, VT, Expand);
setOperationAction(ISD::STRICT_FP_EXTEND, VT, Expand);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Expand);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Expand);
// For most targets @llvm.get.dynamic.area.offset just returns 0.
setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, VT, Expand);
@@ -824,7 +832,8 @@ TargetLoweringBase::getTypeConversion(LLVMContext &Context, EVT VT) const {
LegalizeTypeAction LA = ValueTypeActions.getTypeAction(SVT);
assert((LA == TypeLegal || LA == TypeSoftenFloat ||
- ValueTypeActions.getTypeAction(NVT) != TypePromoteInteger) &&
+ (NVT.isVector() ||
+ ValueTypeActions.getTypeAction(NVT) != TypePromoteInteger)) &&
"Promote may not follow Expand or Promote");
if (LA == TypeSplitVector)
@@ -1257,17 +1266,23 @@ void TargetLoweringBase::computeRegisterProperties(
MVT EltVT = VT.getVectorElementType();
unsigned NElts = VT.getVectorNumElements();
bool IsLegalWiderType = false;
+ bool IsScalable = VT.isScalableVector();
LegalizeTypeAction PreferredAction = getPreferredVectorAction(VT);
switch (PreferredAction) {
- case TypePromoteInteger:
+ case TypePromoteInteger: {
+ MVT::SimpleValueType EndVT = IsScalable ?
+ MVT::LAST_INTEGER_SCALABLE_VECTOR_VALUETYPE :
+ MVT::LAST_INTEGER_FIXEDLEN_VECTOR_VALUETYPE;
// Try to promote the elements of integer vectors. If no legal
// promotion was found, fall through to the widen-vector method.
- for (unsigned nVT = i + 1; nVT <= MVT::LAST_INTEGER_VECTOR_VALUETYPE; ++nVT) {
+ for (unsigned nVT = i + 1;
+ (MVT::SimpleValueType)nVT <= EndVT; ++nVT) {
MVT SVT = (MVT::SimpleValueType) nVT;
// Promote vectors of integers to vectors with the same number
// of elements, with a wider element type.
if (SVT.getScalarSizeInBits() > EltVT.getSizeInBits() &&
- SVT.getVectorNumElements() == NElts && isTypeLegal(SVT)) {
+ SVT.getVectorNumElements() == NElts &&
+ SVT.isScalableVector() == IsScalable && isTypeLegal(SVT)) {
TransformToType[i] = SVT;
RegisterTypeForVT[i] = SVT;
NumRegistersForVT[i] = 1;
@@ -1279,23 +1294,37 @@ void TargetLoweringBase::computeRegisterProperties(
if (IsLegalWiderType)
break;
LLVM_FALLTHROUGH;
+ }
case TypeWidenVector:
- // Try to widen the vector.
- for (unsigned nVT = i + 1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
- MVT SVT = (MVT::SimpleValueType) nVT;
- if (SVT.getVectorElementType() == EltVT
- && SVT.getVectorNumElements() > NElts && isTypeLegal(SVT)) {
- TransformToType[i] = SVT;
- RegisterTypeForVT[i] = SVT;
- NumRegistersForVT[i] = 1;
+ if (isPowerOf2_32(NElts)) {
+ // Try to widen the vector.
+ for (unsigned nVT = i + 1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
+ MVT SVT = (MVT::SimpleValueType) nVT;
+ if (SVT.getVectorElementType() == EltVT
+ && SVT.getVectorNumElements() > NElts
+ && SVT.isScalableVector() == IsScalable && isTypeLegal(SVT)) {
+ TransformToType[i] = SVT;
+ RegisterTypeForVT[i] = SVT;
+ NumRegistersForVT[i] = 1;
+ ValueTypeActions.setTypeAction(VT, TypeWidenVector);
+ IsLegalWiderType = true;
+ break;
+ }
+ }
+ if (IsLegalWiderType)
+ break;
+ } else {
+ // Only widen to the next power of 2 to keep consistency with EVT.
+ MVT NVT = VT.getPow2VectorType();
+ if (isTypeLegal(NVT)) {
+ TransformToType[i] = NVT;
ValueTypeActions.setTypeAction(VT, TypeWidenVector);
- IsLegalWiderType = true;
+ RegisterTypeForVT[i] = NVT;
+ NumRegistersForVT[i] = 1;
break;
}
}
- if (IsLegalWiderType)
- break;
LLVM_FALLTHROUGH;
case TypeSplitVector:
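Aside: the rewritten TypeWidenVector case only scans wider simple types when the element count is already a power of 2; otherwise it jumps straight to VT.getPow2VectorType(). The rounding rule in isolation (plain C++, not LLVM API):

    #include <cassert>

    // Round an element count up to the next power of 2, mirroring what
    // getPow2VectorType() does on the new path (v3i16 widens to v4i16,
    // never past it).
    unsigned nextPow2(unsigned N) {
      unsigned P = 1;
      while (P < N)
        P <<= 1;
      return P;
    }

    int main() {
      assert(nextPow2(3) == 4);
      assert(nextPow2(4) == 4);
      assert(nextPow2(5) == 8);
    }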
@@ -1488,12 +1517,9 @@ unsigned TargetLoweringBase::getByValTypeAlignment(Type *Ty,
return DL.getABITypeAlignment(Ty);
}
-bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context,
- const DataLayout &DL, EVT VT,
- unsigned AddrSpace,
- unsigned Alignment,
- MachineMemOperand::Flags Flags,
- bool *Fast) const {
+bool TargetLoweringBase::allowsMemoryAccessForAlignment(
+ LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace,
+ unsigned Alignment, MachineMemOperand::Flags Flags, bool *Fast) const {
// Check if the specified alignment is sufficient based on the data layout.
// TODO: While using the data layout works in practice, a better solution
// would be to implement this check directly (make this a virtual function).
@@ -1511,6 +1537,21 @@ bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context,
return allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags, Fast);
}
+bool TargetLoweringBase::allowsMemoryAccessForAlignment(
+ LLVMContext &Context, const DataLayout &DL, EVT VT,
+ const MachineMemOperand &MMO, bool *Fast) const {
+ return allowsMemoryAccessForAlignment(Context, DL, VT, MMO.getAddrSpace(),
+ MMO.getAlignment(), MMO.getFlags(),
+ Fast);
+}
+
+bool TargetLoweringBase::allowsMemoryAccess(
+ LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace,
+ unsigned Alignment, MachineMemOperand::Flags Flags, bool *Fast) const {
+ return allowsMemoryAccessForAlignment(Context, DL, VT, AddrSpace, Alignment,
+ Flags, Fast);
+}
+
bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context,
const DataLayout &DL, EVT VT,
const MachineMemOperand &MMO,
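Aside: the old allowsMemoryAccess() body moves into allowsMemoryAccessForAlignment(), with allowsMemoryAccess() reduced to a forwarder, so a target can later override the general query without disturbing the pure alignment check. A usage sketch (assumes TLI, Ctx and DL are in scope; not code from this patch):

    bool Fast = false;
    // Alignment-only query: is an align-4 v4i32 access in address space 0
    // allowed, and is it fast? The MachineMemOperand overload just forwards
    // the operand's address space, alignment and flags.
    if (TLI.allowsMemoryAccessForAlignment(Ctx, DL, MVT::v4i32,
                                           /*AddrSpace=*/0, /*Alignment=*/4,
                                           MachineMemOperand::MONone, &Fast)) {
      // ... the access is legal; Fast reports whether it is also cheap.
    }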
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 4c8f75b237aa..4978f4b9500b 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -43,6 +43,7 @@
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCSectionWasm.h"
+#include "llvm/MC/MCSectionXCOFF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
@@ -154,6 +155,7 @@ void TargetLoweringObjectFileELF::Initialize(MCContext &Ctx,
break;
case Triple::aarch64:
case Triple::aarch64_be:
+ case Triple::aarch64_32:
// The small model guarantees static code/data size < 4GB, but not where it
// will be in memory. Most of these could end up >2GB away so even a signed
// pc-relative 32-bit address is insufficient, theoretically.
@@ -375,7 +377,7 @@ void TargetLoweringObjectFileELF::emitPersonalityValue(
ELF::SHT_PROGBITS, Flags, 0);
unsigned Size = DL.getPointerSize();
Streamer.SwitchSection(Sec);
- Streamer.EmitValueToAlignment(DL.getPointerABIAlignment(0));
+ Streamer.EmitValueToAlignment(DL.getPointerABIAlignment(0).value());
Streamer.EmitSymbolAttribute(Label, MCSA_ELF_TypeObject);
const MCExpr *E = MCConstantExpr::create(Size, getContext());
Streamer.emitELFSize(Label, E);
@@ -524,8 +526,8 @@ static const MCSymbolELF *getAssociatedSymbol(const GlobalObject *GO,
if (!VM)
report_fatal_error("MD_associated operand is not ValueAsMetadata");
- GlobalObject *OtherGO = dyn_cast<GlobalObject>(VM->getValue());
- return OtherGO ? dyn_cast<MCSymbolELF>(TM.getSymbol(OtherGO)) : nullptr;
+ auto *OtherGV = dyn_cast<GlobalValue>(VM->getValue());
+ return OtherGV ? dyn_cast<MCSymbolELF>(TM.getSymbol(OtherGV)) : nullptr;
}
static unsigned getEntrySizeForKind(SectionKind Kind) {
@@ -566,6 +568,8 @@ MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(
SectionName = Attrs.getAttribute("bss-section").getValueAsString();
} else if (Attrs.hasAttribute("rodata-section") && Kind.isReadOnly()) {
SectionName = Attrs.getAttribute("rodata-section").getValueAsString();
+ } else if (Attrs.hasAttribute("relro-section") && Kind.isReadOnlyWithRel()) {
+ SectionName = Attrs.getAttribute("relro-section").getValueAsString();
} else if (Attrs.hasAttribute("data-section") && Kind.isData()) {
SectionName = Attrs.getAttribute("data-section").getValueAsString();
}
@@ -1107,8 +1111,8 @@ MCSymbol *TargetLoweringObjectFileMachO::getCFIPersonalitySymbol(
}
const MCExpr *TargetLoweringObjectFileMachO::getIndirectSymViaGOTPCRel(
- const MCSymbol *Sym, const MCValue &MV, int64_t Offset,
- MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+ const GlobalValue *GV, const MCSymbol *Sym, const MCValue &MV,
+ int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const {
// Although MachO 32-bit targets do not explicitly have a GOTPCREL relocation
// as 64-bit do, we replace the GOT equivalent by accessing the final symbol
// through a non_lazy_ptr stub instead. One advantage is that it allows the
@@ -1165,12 +1169,10 @@ const MCExpr *TargetLoweringObjectFileMachO::getIndirectSymViaGOTPCRel(
MCSymbol *Stub = Ctx.getOrCreateSymbol(Name);
MachineModuleInfoImpl::StubValueTy &StubSym = MachOMMI.getGVStubEntry(Stub);
- if (!StubSym.getPointer()) {
- bool IsIndirectLocal = Sym->isDefined() && !Sym->isExternal();
- // With the assumption that IsIndirectLocal == GV->hasLocalLinkage().
+
+ if (!StubSym.getPointer())
StubSym = MachineModuleInfoImpl::StubValueTy(const_cast<MCSymbol *>(Sym),
- !IsIndirectLocal);
- }
+ !GV->hasLocalLinkage());
const MCExpr *BSymExpr =
MCSymbolRefExpr::create(BaseSym, MCSymbolRefExpr::VK_None, Ctx);
@@ -1519,7 +1521,8 @@ static MCSectionCOFF *getCOFFStaticStructorSection(MCContext &Ctx,
// internally, so we use ".CRT$XCA00001" for them.
SmallString<24> Name;
raw_svector_ostream OS(Name);
- OS << ".CRT$XC" << (Priority < 200 ? 'A' : 'T') << format("%05u", Priority);
+ OS << ".CRT$X" << (IsCtor ? "C" : "T") <<
+ (Priority < 200 ? 'A' : 'T') << format("%05u", Priority);
MCSectionCOFF *Sec = Ctx.getCOFFSection(
Name, COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ,
SectionKind::getReadOnly());
@@ -1595,7 +1598,8 @@ const MCExpr *TargetLoweringObjectFileCOFF::lowerRelativeReference(
static std::string APIntToHexString(const APInt &AI) {
unsigned Width = (AI.getBitWidth() / 8) * 2;
- std::string HexString = utohexstr(AI.getLimitedValue(), /*LowerCase=*/true);
+ std::string HexString = AI.toString(16, /*Signed=*/false);
+ transform(HexString.begin(), HexString.end(), HexString.begin(), tolower);
unsigned Size = HexString.size();
assert(Width >= Size && "hex string is too large!");
HexString.insert(HexString.begin(), Width - Size, '0');
@@ -1819,3 +1823,82 @@ MCSection *TargetLoweringObjectFileWasm::getStaticDtorSection(
llvm_unreachable("@llvm.global_dtors should have been lowered already");
return nullptr;
}
+
+//===----------------------------------------------------------------------===//
+// XCOFF
+//===----------------------------------------------------------------------===//
+MCSection *TargetLoweringObjectFileXCOFF::getExplicitSectionGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+ report_fatal_error("XCOFF explicit sections not yet implemented.");
+}
+
+MCSection *TargetLoweringObjectFileXCOFF::SelectSectionForGlobal(
+ const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+ assert(!TM.getFunctionSections() && !TM.getDataSections() &&
+ "XCOFF unique sections not yet implemented.");
+
+ // Common symbols go into a csect with matching name which will get mapped
+ // into the .bss section.
+ if (Kind.isBSSLocal() || Kind.isCommon()) {
+ SmallString<128> Name;
+ getNameWithPrefix(Name, GO, TM);
+ XCOFF::StorageClass SC =
+ TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GO);
+ return getContext().getXCOFFSection(
+ Name, Kind.isBSSLocal() ? XCOFF::XMC_BS : XCOFF::XMC_RW, XCOFF::XTY_CM,
+ SC, Kind, /* BeginSymbolName */ nullptr);
+ }
+
+ if (Kind.isText())
+ return TextSection;
+
+ if (Kind.isData())
+ return DataSection;
+
+ report_fatal_error("XCOFF other section types not yet implemented.");
+}
+
+bool TargetLoweringObjectFileXCOFF::shouldPutJumpTableInFunctionSection(
+ bool UsesLabelDifference, const Function &F) const {
+ report_fatal_error("TLOF XCOFF not yet implemented.");
+}
+
+void TargetLoweringObjectFileXCOFF::Initialize(MCContext &Ctx,
+ const TargetMachine &TgtM) {
+ TargetLoweringObjectFile::Initialize(Ctx, TgtM);
+ TTypeEncoding = 0;
+ PersonalityEncoding = 0;
+ LSDAEncoding = 0;
+}
+
+MCSection *TargetLoweringObjectFileXCOFF::getStaticCtorSection(
+ unsigned Priority, const MCSymbol *KeySym) const {
+ report_fatal_error("XCOFF ctor section not yet implemented.");
+}
+
+MCSection *TargetLoweringObjectFileXCOFF::getStaticDtorSection(
+ unsigned Priority, const MCSymbol *KeySym) const {
+ report_fatal_error("XCOFF dtor section not yet implemented.");
+}
+
+const MCExpr *TargetLoweringObjectFileXCOFF::lowerRelativeReference(
+ const GlobalValue *LHS, const GlobalValue *RHS,
+ const TargetMachine &TM) const {
+ report_fatal_error("XCOFF not yet implemented.");
+}
+
+XCOFF::StorageClass TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(
+ const GlobalObject *GO) {
+ switch (GO->getLinkage()) {
+ case GlobalValue::InternalLinkage:
+ return XCOFF::C_HIDEXT;
+ case GlobalValue::ExternalLinkage:
+ case GlobalValue::CommonLinkage:
+ return XCOFF::C_EXT;
+ case GlobalValue::ExternalWeakLinkage:
+ return XCOFF::C_WEAKEXT;
+ default:
+ report_fatal_error(
+ "Unhandled linkage when mapping linkage to StorageClass.");
+ }
+}
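Aside: most of the new XCOFF hooks are report_fatal_error placeholders; the one piece of real policy is getStorageClassForGlobal(). A standalone mimic of that mapping (simplified enums, not the LLVM types):

    #include <stdexcept>

    enum class Linkage { Internal, External, Common, ExternalWeak, Other };
    enum class StorageClass { C_HIDEXT, C_EXT, C_WEAKEXT };

    // Same shape as the switch above: internal symbols become hidden-external
    // csects, external and common become external, weak-extern stays weak.
    StorageClass storageClassFor(Linkage L) {
      switch (L) {
      case Linkage::Internal:     return StorageClass::C_HIDEXT;
      case Linkage::External:
      case Linkage::Common:       return StorageClass::C_EXT;
      case Linkage::ExternalWeak: return StorageClass::C_WEAKEXT;
      default:
        throw std::runtime_error("Unhandled linkage when mapping to StorageClass");
      }
    }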
diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp
index 36df02692f86..f1f4f65adf7c 100644
--- a/lib/CodeGen/TargetPassConfig.cpp
+++ b/lib/CodeGen/TargetPassConfig.cpp
@@ -49,9 +49,10 @@
using namespace llvm;
-cl::opt<bool> EnableIPRA("enable-ipra", cl::init(false), cl::Hidden,
- cl::desc("Enable interprocedural register allocation "
- "to reduce load/store at procedure calls."));
+static cl::opt<bool>
+ EnableIPRA("enable-ipra", cl::init(false), cl::Hidden,
+ cl::desc("Enable interprocedural register allocation "
+ "to reduce load/store at procedure calls."));
static cl::opt<bool> DisablePostRASched("disable-post-ra", cl::Hidden,
cl::desc("Disable Post Regalloc Scheduler"));
static cl::opt<bool> DisableBranchFold("disable-branch-fold", cl::Hidden,
@@ -152,8 +153,10 @@ static cl::opt<GlobalISelAbortMode> EnableGlobalISelAbort(
// substitutePass(&PostRASchedulerID, &PostMachineSchedulerID).
// Targets can return true in targetSchedulesPostRAScheduling() and
// insert a PostRA scheduling pass wherever it wants.
-cl::opt<bool> MISchedPostRA("misched-postra", cl::Hidden,
- cl::desc("Run MachineScheduler post regalloc (independent of preRA sched)"));
+static cl::opt<bool> MISchedPostRA(
+ "misched-postra", cl::Hidden,
+ cl::desc(
+ "Run MachineScheduler post regalloc (independent of preRA sched)"));
// Experimental option to run live interval analysis early.
static cl::opt<bool> EarlyLiveIntervals("early-live-intervals", cl::Hidden,
@@ -175,10 +178,10 @@ static cl::opt<CFLAAType> UseCFLAA(
/// Option names for limiting the codegen pipeline.
/// Those are used in error reporting and we didn't want
/// to duplicate their names all over the place.
-const char *StartAfterOptName = "start-after";
-const char *StartBeforeOptName = "start-before";
-const char *StopAfterOptName = "stop-after";
-const char *StopBeforeOptName = "stop-before";
+static const char *StartAfterOptName = "start-after";
+static const char *StartBeforeOptName = "start-before";
+static const char *StopAfterOptName = "stop-after";
+static const char *StopBeforeOptName = "stop-before";
static cl::opt<std::string>
StartAfterOpt(StringRef(StartAfterOptName),
@@ -654,6 +657,7 @@ void TargetPassConfig::addIRPasses() {
// TODO: add a pass insertion point here
addPass(createGCLoweringPass());
addPass(createShadowStackGCLoweringPass());
+ addPass(createLowerConstantIntrinsicsPass());
// Make sure that no unreachable blocks are instruction selected.
addPass(createUnreachableBlockEliminationPass());
@@ -1231,5 +1235,5 @@ bool TargetPassConfig::isGISelCSEEnabled() const {
}
std::unique_ptr<CSEConfigBase> TargetPassConfig::getCSEConfig() const {
- return make_unique<CSEConfigBase>();
+ return std::make_unique<CSEConfigBase>();
}
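Aside: llvm::make_unique was a pre-C++14 shim; with the codebase now on C++14 the standard-library spelling is used directly. In isolation:

    #include <memory>

    struct CSEConfigBase { virtual ~CSEConfigBase() = default; };

    std::unique_ptr<CSEConfigBase> makeDefaultCSEConfig() {
      return std::make_unique<CSEConfigBase>(); // previously llvm::make_unique
    }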
diff --git a/lib/CodeGen/TargetRegisterInfo.cpp b/lib/CodeGen/TargetRegisterInfo.cpp
index f1b2ecf3243b..e5592c31098a 100644
--- a/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/lib/CodeGen/TargetRegisterInfo.cpp
@@ -86,22 +86,21 @@ bool TargetRegisterInfo::checkAllSuperRegsMarked(const BitVector &RegisterSet,
namespace llvm {
-Printable printReg(unsigned Reg, const TargetRegisterInfo *TRI,
+Printable printReg(Register Reg, const TargetRegisterInfo *TRI,
unsigned SubIdx, const MachineRegisterInfo *MRI) {
return Printable([Reg, TRI, SubIdx, MRI](raw_ostream &OS) {
if (!Reg)
OS << "$noreg";
- else if (TargetRegisterInfo::isStackSlot(Reg))
- OS << "SS#" << TargetRegisterInfo::stackSlot2Index(Reg);
- else if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ else if (Register::isStackSlot(Reg))
+ OS << "SS#" << Register::stackSlot2Index(Reg);
+ else if (Register::isVirtualRegister(Reg)) {
StringRef Name = MRI ? MRI->getVRegName(Reg) : "";
if (Name != "") {
OS << '%' << Name;
} else {
- OS << '%' << TargetRegisterInfo::virtReg2Index(Reg);
+ OS << '%' << Register::virtReg2Index(Reg);
}
- }
- else if (!TRI)
+ } else if (!TRI)
OS << '$' << "physreg" << Reg;
else if (Reg < TRI->getNumRegs()) {
OS << '$';
@@ -143,8 +142,8 @@ Printable printRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
Printable printVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
return Printable([Unit, TRI](raw_ostream &OS) {
- if (TRI && TRI->isVirtualRegister(Unit)) {
- OS << '%' << TargetRegisterInfo::virtReg2Index(Unit);
+ if (Register::isVirtualRegister(Unit)) {
+ OS << '%' << Register::virtReg2Index(Unit);
} else {
OS << printRegUnit(Unit, TRI);
}
@@ -189,7 +188,8 @@ TargetRegisterInfo::getAllocatableClass(const TargetRegisterClass *RC) const {
/// the right type that contains this physreg.
const TargetRegisterClass *
TargetRegisterInfo::getMinimalPhysRegClass(unsigned reg, MVT VT) const {
- assert(isPhysicalRegister(reg) && "reg must be a physical register");
+ assert(Register::isPhysicalRegister(reg) &&
+ "reg must be a physical register");
  // Pick the smallest sub-register class of the right type that contains
  // this physreg.
@@ -238,24 +238,16 @@ BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF,
static inline
const TargetRegisterClass *firstCommonClass(const uint32_t *A,
const uint32_t *B,
- const TargetRegisterInfo *TRI,
- const MVT::SimpleValueType SVT =
- MVT::SimpleValueType::Any) {
- const MVT VT(SVT);
+ const TargetRegisterInfo *TRI) {
for (unsigned I = 0, E = TRI->getNumRegClasses(); I < E; I += 32)
- if (unsigned Common = *A++ & *B++) {
- const TargetRegisterClass *RC =
- TRI->getRegClass(I + countTrailingZeros(Common));
- if (SVT == MVT::SimpleValueType::Any || TRI->isTypeLegalForClass(*RC, VT))
- return RC;
- }
+ if (unsigned Common = *A++ & *B++)
+ return TRI->getRegClass(I + countTrailingZeros(Common));
return nullptr;
}
const TargetRegisterClass *
TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A,
- const TargetRegisterClass *B,
- const MVT::SimpleValueType SVT) const {
+ const TargetRegisterClass *B) const {
// First take care of the trivial cases.
if (A == B)
return A;
@@ -264,7 +256,7 @@ TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A,
// Register classes are ordered topologically, so the largest common
  // sub-class is the common sub-class with the smallest ID.
- return firstCommonClass(A->getSubClassMask(), B->getSubClassMask(), this, SVT);
+ return firstCommonClass(A->getSubClassMask(), B->getSubClassMask(), this);
}
const TargetRegisterClass *
@@ -409,7 +401,7 @@ TargetRegisterInfo::getRegAllocationHints(unsigned VirtReg,
// Target-independent hints are either a physical or a virtual register.
unsigned Phys = Reg;
- if (VRM && isVirtualRegister(Phys))
+ if (VRM && Register::isVirtualRegister(Phys))
Phys = VRM->getPhys(Phys);
// Don't add the same reg twice (Hints_MRI may contain multiple virtual
@@ -417,7 +409,7 @@ TargetRegisterInfo::getRegAllocationHints(unsigned VirtReg,
if (!HintedRegs.insert(Phys).second)
continue;
// Check that Phys is a valid hint in VirtReg's register class.
- if (!isPhysicalRegister(Phys))
+ if (!Register::isPhysicalRegister(Phys))
continue;
if (MRI.isReserved(Phys))
continue;
@@ -433,6 +425,20 @@ TargetRegisterInfo::getRegAllocationHints(unsigned VirtReg,
return false;
}
+bool TargetRegisterInfo::isCalleeSavedPhysReg(
+ unsigned PhysReg, const MachineFunction &MF) const {
+ if (PhysReg == 0)
+ return false;
+ const uint32_t *callerPreservedRegs =
+ getCallPreservedMask(MF, MF.getFunction().getCallingConv());
+ if (callerPreservedRegs) {
+ assert(Register::isPhysicalRegister(PhysReg) &&
+ "Expected physical register");
+ return (callerPreservedRegs[PhysReg / 32] >> PhysReg % 32) & 1;
+ }
+ return false;
+}
+
bool TargetRegisterInfo::canRealignStack(const MachineFunction &MF) const {
return !MF.getFunction().hasFnAttribute("no-realign-stack");
}
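Aside: the new isCalleeSavedPhysReg() reads the call-preserved mask, which packs one bit per physical register into 32-bit words. The indexing, spelled out as a self-contained test:

    #include <cassert>
    #include <cstdint>

    // Bit PhysReg lives in word PhysReg/32 at position PhysReg%32; a set bit
    // means the register is preserved across the call.
    bool isPreserved(const uint32_t *Mask, unsigned PhysReg) {
      return (Mask[PhysReg / 32] >> (PhysReg % 32)) & 1;
    }

    int main() {
      const uint32_t Mask[2] = {0x0, 0x1}; // only register 32 preserved
      assert(!isPreserved(Mask, 0));
      assert(isPreserved(Mask, 32));
    }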
@@ -466,7 +472,7 @@ bool TargetRegisterInfo::regmaskSubsetEqual(const uint32_t *mask0,
unsigned TargetRegisterInfo::getRegSizeInBits(unsigned Reg,
const MachineRegisterInfo &MRI) const {
const TargetRegisterClass *RC{};
- if (isPhysicalRegister(Reg)) {
+ if (Register::isPhysicalRegister(Reg)) {
// The size is not directly available for physical registers.
// Instead, we need to access a register class that contains Reg and
// get the size of that register class.
@@ -501,7 +507,7 @@ TargetRegisterInfo::lookThruCopyLike(unsigned SrcReg,
CopySrcReg = MI->getOperand(2).getReg();
}
- if (!isVirtualRegister(CopySrcReg))
+ if (!Register::isVirtualRegister(CopySrcReg))
return CopySrcReg;
SrcReg = CopySrcReg;
diff --git a/lib/CodeGen/TargetSchedule.cpp b/lib/CodeGen/TargetSchedule.cpp
index 195279719ad4..ce59452fd1b8 100644
--- a/lib/CodeGen/TargetSchedule.cpp
+++ b/lib/CodeGen/TargetSchedule.cpp
@@ -300,7 +300,7 @@ computeOutputLatency(const MachineInstr *DefMI, unsigned DefOperIdx,
// TODO: The following hack exists because predication passes do not
// correctly append imp-use operands, and readsReg() strangely returns false
// for predicated defs.
- unsigned Reg = DefMI->getOperand(DefOperIdx).getReg();
+ Register Reg = DefMI->getOperand(DefOperIdx).getReg();
const MachineFunction &MF = *DefMI->getMF();
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
if (!DepMI->readsRegister(Reg, TRI) && TII->isPredicated(*DepMI))
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index 43d876646967..ea971809d4e4 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -230,7 +230,7 @@ sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg,
for (const MachineOperand &MO : MI->operands()) {
if (!MO.isReg())
continue;
- unsigned MOReg = MO.getReg();
+ Register MOReg = MO.getReg();
if (!MOReg)
continue;
if (MO.isUse() && MOReg != SavedReg)
@@ -299,7 +299,7 @@ sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg,
MachineOperand &MO = OtherMI.getOperand(i);
if (!MO.isReg())
continue;
- unsigned MOReg = MO.getReg();
+ Register MOReg = MO.getReg();
if (!MOReg)
continue;
if (DefReg == MOReg)
@@ -418,8 +418,8 @@ static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII,
} else
return false;
- IsSrcPhys = TargetRegisterInfo::isPhysicalRegister(SrcReg);
- IsDstPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
+ IsSrcPhys = Register::isPhysicalRegister(SrcReg);
+ IsDstPhys = Register::isPhysicalRegister(DstReg);
return true;
}
@@ -427,8 +427,7 @@ static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII,
/// given instruction, is killed by the given instruction.
static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg,
LiveIntervals *LIS) {
- if (LIS && TargetRegisterInfo::isVirtualRegister(Reg) &&
- !LIS->isNotInMIMap(*MI)) {
+ if (LIS && Register::isVirtualRegister(Reg) && !LIS->isNotInMIMap(*MI)) {
// FIXME: Sometimes tryInstructionTransform() will add instructions and
// test whether they can be folded before keeping them. In this case it
// sets a kill before recursively calling tryInstructionTransform() again.
@@ -475,12 +474,12 @@ static bool isKilled(MachineInstr &MI, unsigned Reg,
MachineInstr *DefMI = &MI;
while (true) {
// All uses of physical registers are likely to be kills.
- if (TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ if (Register::isPhysicalRegister(Reg) &&
(allowFalsePositives || MRI->hasOneUse(Reg)))
return true;
if (!isPlainlyKilled(DefMI, Reg, LIS))
return false;
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
return true;
MachineRegisterInfo::def_iterator Begin = MRI->def_begin(Reg);
// If there are multiple defs, we can't do a simple analysis, so just
@@ -536,7 +535,7 @@ MachineInstr *findOnlyInterestingUse(unsigned Reg, MachineBasicBlock *MBB,
}
IsDstPhys = false;
if (isTwoAddrUse(UseMI, Reg, DstReg)) {
- IsDstPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
+ IsDstPhys = Register::isPhysicalRegister(DstReg);
return &UseMI;
}
return nullptr;
@@ -546,13 +545,13 @@ MachineInstr *findOnlyInterestingUse(unsigned Reg, MachineBasicBlock *MBB,
/// to.
static unsigned
getMappedReg(unsigned Reg, DenseMap<unsigned, unsigned> &RegMap) {
- while (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ while (Register::isVirtualRegister(Reg)) {
DenseMap<unsigned, unsigned>::iterator SI = RegMap.find(Reg);
if (SI == RegMap.end())
return 0;
Reg = SI->second;
}
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
return Reg;
return 0;
}
@@ -683,7 +682,7 @@ bool TwoAddressInstructionPass::commuteInstruction(MachineInstr *MI,
unsigned RegBIdx,
unsigned RegCIdx,
unsigned Dist) {
- unsigned RegC = MI->getOperand(RegCIdx).getReg();
+ Register RegC = MI->getOperand(RegCIdx).getReg();
LLVM_DEBUG(dbgs() << "2addr: COMMUTING : " << *MI);
MachineInstr *NewMI = TII->commuteInstruction(*MI, false, RegBIdx, RegCIdx);
@@ -700,7 +699,7 @@ bool TwoAddressInstructionPass::commuteInstruction(MachineInstr *MI,
// Update source register map.
unsigned FromRegC = getMappedReg(RegC, SrcRegMap);
if (FromRegC) {
- unsigned RegA = MI->getOperand(DstIdx).getReg();
+ Register RegA = MI->getOperand(DstIdx).getReg();
SrcRegMap[RegA] = FromRegC;
}
@@ -911,7 +910,7 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
for (const MachineOperand &MO : MI->operands()) {
if (!MO.isReg())
continue;
- unsigned MOReg = MO.getReg();
+ Register MOReg = MO.getReg();
if (!MOReg)
continue;
if (MO.isDef())
@@ -955,7 +954,7 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
for (const MachineOperand &MO : OtherMI.operands()) {
if (!MO.isReg())
continue;
- unsigned MOReg = MO.getReg();
+ Register MOReg = MO.getReg();
if (!MOReg)
continue;
if (MO.isDef()) {
@@ -1093,7 +1092,7 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi,
for (const MachineOperand &MO : KillMI->operands()) {
if (!MO.isReg())
continue;
- unsigned MOReg = MO.getReg();
+ Register MOReg = MO.getReg();
if (MO.isUse()) {
if (!MOReg)
continue;
@@ -1105,7 +1104,7 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi,
Uses.insert(MOReg);
if (isKill && MOReg != Reg)
Kills.insert(MOReg);
- } else if (TargetRegisterInfo::isPhysicalRegister(MOReg)) {
+ } else if (Register::isPhysicalRegister(MOReg)) {
Defs.insert(MOReg);
if (!MO.isDead())
LiveDefs.insert(MOReg);
@@ -1130,7 +1129,7 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi,
for (const MachineOperand &MO : OtherMI.operands()) {
if (!MO.isReg())
continue;
- unsigned MOReg = MO.getReg();
+ Register MOReg = MO.getReg();
if (!MOReg)
continue;
if (MO.isUse()) {
@@ -1154,8 +1153,7 @@ rescheduleKillAboveMI(MachineBasicBlock::iterator &mi,
unsigned MOReg = OtherDefs[i];
if (Uses.count(MOReg))
return false;
- if (TargetRegisterInfo::isPhysicalRegister(MOReg) &&
- LiveDefs.count(MOReg))
+ if (Register::isPhysicalRegister(MOReg) && LiveDefs.count(MOReg))
return false;
// Physical register def is seen.
Defs.erase(MOReg);
@@ -1208,8 +1206,8 @@ bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI,
return false;
bool MadeChange = false;
- unsigned DstOpReg = MI->getOperand(DstOpIdx).getReg();
- unsigned BaseOpReg = MI->getOperand(BaseOpIdx).getReg();
+ Register DstOpReg = MI->getOperand(DstOpIdx).getReg();
+ Register BaseOpReg = MI->getOperand(BaseOpIdx).getReg();
unsigned OpsNum = MI->getDesc().getNumOperands();
unsigned OtherOpIdx = MI->getDesc().getNumDefs();
for (; OtherOpIdx < OpsNum; OtherOpIdx++) {
@@ -1221,7 +1219,7 @@ bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI,
!TII->findCommutedOpIndices(*MI, BaseOpIdx, OtherOpIdx))
continue;
- unsigned OtherOpReg = MI->getOperand(OtherOpIdx).getReg();
+ Register OtherOpReg = MI->getOperand(OtherOpIdx).getReg();
bool AggressiveCommute = false;
// If OtherOp dies but BaseOp does not, swap the OtherOp and BaseOp
@@ -1276,14 +1274,14 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi,
return false;
MachineInstr &MI = *mi;
- unsigned regA = MI.getOperand(DstIdx).getReg();
- unsigned regB = MI.getOperand(SrcIdx).getReg();
+ Register regA = MI.getOperand(DstIdx).getReg();
+ Register regB = MI.getOperand(SrcIdx).getReg();
- assert(TargetRegisterInfo::isVirtualRegister(regB) &&
+ assert(Register::isVirtualRegister(regB) &&
"cannot make instruction into two-address form");
bool regBKilled = isKilled(MI, regB, MRI, TII, LIS, true);
- if (TargetRegisterInfo::isVirtualRegister(regA))
+ if (Register::isVirtualRegister(regA))
scanUses(regA);
bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist);
@@ -1363,7 +1361,7 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi,
const TargetRegisterClass *RC =
TRI->getAllocatableClass(
TII->getRegClass(UnfoldMCID, LoadRegIndex, TRI, *MF));
- unsigned Reg = MRI->createVirtualRegister(RC);
+ Register Reg = MRI->createVirtualRegister(RC);
SmallVector<MachineInstr *, 2> NewMIs;
if (!TII->unfoldMemoryOperand(*MF, MI, Reg,
/*UnfoldLoad=*/true,
@@ -1399,8 +1397,7 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi,
if (LV) {
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI.getOperand(i);
- if (MO.isReg() &&
- TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+ if (MO.isReg() && Register::isVirtualRegister(MO.getReg())) {
if (MO.isUse()) {
if (MO.isKill()) {
if (NewMIs[0]->killsRegister(MO.getReg()))
@@ -1474,8 +1471,8 @@ collectTiedOperands(MachineInstr *MI, TiedOperandMap &TiedOperands) {
AnyOps = true;
MachineOperand &SrcMO = MI->getOperand(SrcIdx);
MachineOperand &DstMO = MI->getOperand(DstIdx);
- unsigned SrcReg = SrcMO.getReg();
- unsigned DstReg = DstMO.getReg();
+ Register SrcReg = SrcMO.getReg();
+ Register DstReg = DstMO.getReg();
// Tied constraint already satisfied?
if (SrcReg == DstReg)
continue;
@@ -1485,7 +1482,7 @@ collectTiedOperands(MachineInstr *MI, TiedOperandMap &TiedOperands) {
// Deal with undef uses immediately - simply rewrite the src operand.
if (SrcMO.isUndef() && !DstMO.getSubReg()) {
// Constrain the DstReg register class if required.
- if (TargetRegisterInfo::isVirtualRegister(DstReg))
+ if (Register::isVirtualRegister(DstReg))
if (const TargetRegisterClass *RC = TII->getRegClass(MCID, SrcIdx,
TRI, *MF))
MRI->constrainRegClass(DstReg, RC);
@@ -1522,7 +1519,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
unsigned DstIdx = TiedPairs[tpi].second;
const MachineOperand &DstMO = MI->getOperand(DstIdx);
- unsigned RegA = DstMO.getReg();
+ Register RegA = DstMO.getReg();
// Grab RegB from the instruction because it may have changed if the
// instruction was commuted.
@@ -1538,7 +1535,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
}
LastCopiedReg = RegA;
- assert(TargetRegisterInfo::isVirtualRegister(RegB) &&
+ assert(Register::isVirtualRegister(RegB) &&
"cannot make instruction into two-address form");
#ifndef NDEBUG
@@ -1559,14 +1556,13 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
MIB.addReg(RegB, 0, SubRegB);
const TargetRegisterClass *RC = MRI->getRegClass(RegB);
if (SubRegB) {
- if (TargetRegisterInfo::isVirtualRegister(RegA)) {
+ if (Register::isVirtualRegister(RegA)) {
assert(TRI->getMatchingSuperRegClass(RC, MRI->getRegClass(RegA),
SubRegB) &&
"tied subregister must be a truncation");
// The superreg class will not be used to constrain the subreg class.
RC = nullptr;
- }
- else {
+ } else {
assert(TRI->getMatchingSuperReg(RegA, SubRegB, MRI->getRegClass(RegB))
&& "tied subregister must be a truncation");
}
@@ -1581,7 +1577,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
if (LIS) {
LastCopyIdx = LIS->InsertMachineInstrInMaps(*PrevMI).getRegSlot();
- if (TargetRegisterInfo::isVirtualRegister(RegA)) {
+ if (Register::isVirtualRegister(RegA)) {
LiveInterval &LI = LIS->getInterval(RegA);
VNInfo *VNI = LI.getNextValue(LastCopyIdx, LIS->getVNInfoAllocator());
SlotIndex endIdx =
@@ -1601,8 +1597,7 @@ TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
}
// Make sure regA is a legal regclass for the SrcIdx operand.
- if (TargetRegisterInfo::isVirtualRegister(RegA) &&
- TargetRegisterInfo::isVirtualRegister(RegB))
+ if (Register::isVirtualRegister(RegA) && Register::isVirtualRegister(RegB))
MRI->constrainRegClass(RegA, RC);
MO.setReg(RegA);
// The getMatchingSuper asserts guarantee that the register class projected
@@ -1744,8 +1739,8 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
if (TiedPairs.size() == 1) {
unsigned SrcIdx = TiedPairs[0].first;
unsigned DstIdx = TiedPairs[0].second;
- unsigned SrcReg = mi->getOperand(SrcIdx).getReg();
- unsigned DstReg = mi->getOperand(DstIdx).getReg();
+ Register SrcReg = mi->getOperand(SrcIdx).getReg();
+ Register DstReg = mi->getOperand(DstIdx).getReg();
if (SrcReg != DstReg &&
tryInstructionTransform(mi, nmi, SrcIdx, DstIdx, Dist, false)) {
// The tied operands have been eliminated or shifted further down
@@ -1803,9 +1798,8 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
void TwoAddressInstructionPass::
eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
MachineInstr &MI = *MBBI;
- unsigned DstReg = MI.getOperand(0).getReg();
- if (MI.getOperand(0).getSubReg() ||
- TargetRegisterInfo::isPhysicalRegister(DstReg) ||
+ Register DstReg = MI.getOperand(0).getReg();
+ if (MI.getOperand(0).getSubReg() || Register::isPhysicalRegister(DstReg) ||
!(MI.getNumOperands() & 1)) {
LLVM_DEBUG(dbgs() << "Illegal REG_SEQUENCE instruction:" << MI);
llvm_unreachable(nullptr);
@@ -1821,7 +1815,7 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
bool DefEmitted = false;
for (unsigned i = 1, e = MI.getNumOperands(); i < e; i += 2) {
MachineOperand &UseMO = MI.getOperand(i);
- unsigned SrcReg = UseMO.getReg();
+ Register SrcReg = UseMO.getReg();
unsigned SubIdx = MI.getOperand(i+1).getImm();
// Nothing needs to be inserted for undef operands.
if (UseMO.isUndef())
@@ -1855,7 +1849,7 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
DefEmitted = true;
// Update LiveVariables' kill info.
- if (LV && isKill && !TargetRegisterInfo::isPhysicalRegister(SrcReg))
+ if (LV && isKill && !Register::isPhysicalRegister(SrcReg))
LV->replaceKillInstruction(SrcReg, MI, *CopyMI);
LLVM_DEBUG(dbgs() << "Inserted: " << *CopyMI);
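Aside: this file, like the TargetRegisterInfo and TargetSchedule hunks above, replaces raw unsigned register numbers with llvm::Register, which carries the virtual/physical distinction in the value itself; the static TargetRegisterInfo::is*Register helpers become Register members or statics. A standalone mimic of the encoding (an assumption drawn from Register.h of this era, deliberately simplified; stack slots are ignored):

    #include <cassert>
    #include <cstdint>

    // Virtual registers occupy the top half of the 32-bit space (high bit
    // set); small nonzero values are physical registers; 0 is "no register".
    struct Reg {
      uint32_t Id;
      bool isVirtual()  const { return static_cast<int32_t>(Id) < 0; }
      bool isPhysical() const { return static_cast<int32_t>(Id) > 0; }
      uint32_t virtIndex() const { return Id & ~(1u << 31); }
    };

    int main() {
      Reg V{(1u << 31) | 7}, P{7};
      assert(V.isVirtual() && V.virtIndex() == 7);
      assert(P.isPhysical() && !P.isVirtual());
    }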
diff --git a/lib/CodeGen/UnreachableBlockElim.cpp b/lib/CodeGen/UnreachableBlockElim.cpp
index 177bab32bccc..3289eff71336 100644
--- a/lib/CodeGen/UnreachableBlockElim.cpp
+++ b/lib/CodeGen/UnreachableBlockElim.cpp
@@ -103,7 +103,8 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) {
df_iterator_default_set<MachineBasicBlock*> Reachable;
bool ModifiedPHI = false;
- MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+ auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>();
+ MMI = MMIWP ? &MMIWP->getMMI() : nullptr;
MachineDominatorTree *MDT = getAnalysisIfAvailable<MachineDominatorTree>();
MachineLoopInfo *MLI = getAnalysisIfAvailable<MachineLoopInfo>();
@@ -146,8 +147,14 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) {
}
// Actually remove the blocks now.
- for (unsigned i = 0, e = DeadBlocks.size(); i != e; ++i)
+ for (unsigned i = 0, e = DeadBlocks.size(); i != e; ++i) {
+ // Remove any call site information for calls in the block.
+ for (auto &I : DeadBlocks[i]->instrs())
+ if (I.isCall(MachineInstr::IgnoreBundle))
+ DeadBlocks[i]->getParent()->eraseCallSiteInfo(&I);
+
DeadBlocks[i]->eraseFromParent();
+ }
// Cleanup PHI nodes.
for (MachineFunction::iterator I = F.begin(), E = F.end(); I != E; ++I) {
@@ -167,8 +174,8 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) {
if (phi->getNumOperands() == 3) {
const MachineOperand &Input = phi->getOperand(1);
const MachineOperand &Output = phi->getOperand(0);
- unsigned InputReg = Input.getReg();
- unsigned OutputReg = Output.getReg();
+ Register InputReg = Input.getReg();
+ Register OutputReg = Output.getReg();
assert(Output.getSubReg() == 0 && "Cannot have output subregister");
ModifiedPHI = true;
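Aside: the new loop above drops call-site entries before each dead block is erased. MachineFunction keeps that information in a side table keyed by MachineInstr pointers, so deleting the block first would leave dangling keys. The shape of the fix, using the same calls as the hunk (MBB standing in for DeadBlocks[i]):

    for (MachineInstr &I : MBB->instrs())
      if (I.isCall(MachineInstr::IgnoreBundle))
        MBB->getParent()->eraseCallSiteInfo(&I);
    MBB->eraseFromParent(); // safe now: no call-site entry references the block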
diff --git a/lib/CodeGen/ValueTypes.cpp b/lib/CodeGen/ValueTypes.cpp
index a911cdcbec9d..73b862d51c0f 100644
--- a/lib/CodeGen/ValueTypes.cpp
+++ b/lib/CodeGen/ValueTypes.cpp
@@ -115,8 +115,8 @@ std::string EVT::getEVTString() const {
switch (V.SimpleTy) {
default:
if (isVector())
- return "v" + utostr(getVectorNumElements()) +
- getVectorElementType().getEVTString();
+ return (isScalableVector() ? "nxv" : "v") + utostr(getVectorNumElements())
+ + getVectorElementType().getEVTString();
if (isInteger())
return "i" + utostr(getSizeInBits());
llvm_unreachable("Invalid EVT!");
@@ -144,6 +144,7 @@ std::string EVT::getEVTString() const {
case MVT::v32i1: return "v32i1";
case MVT::v64i1: return "v64i1";
case MVT::v128i1: return "v128i1";
+ case MVT::v256i1: return "v256i1";
case MVT::v512i1: return "v512i1";
case MVT::v1024i1: return "v1024i1";
case MVT::v1i8: return "v1i8";
@@ -157,6 +158,7 @@ std::string EVT::getEVTString() const {
case MVT::v256i8: return "v256i8";
case MVT::v1i16: return "v1i16";
case MVT::v2i16: return "v2i16";
+ case MVT::v3i16: return "v3i16";
case MVT::v4i16: return "v4i16";
case MVT::v8i16: return "v8i16";
case MVT::v16i16: return "v16i16";
@@ -187,8 +189,11 @@ std::string EVT::getEVTString() const {
case MVT::v1f32: return "v1f32";
case MVT::v2f32: return "v2f32";
case MVT::v2f16: return "v2f16";
+ case MVT::v3f16: return "v3f16";
case MVT::v4f16: return "v4f16";
case MVT::v8f16: return "v8f16";
+ case MVT::v16f16: return "v16f16";
+ case MVT::v32f16: return "v32f16";
case MVT::v3f32: return "v3f32";
case MVT::v4f32: return "v4f32";
case MVT::v5f32: return "v5f32";
@@ -205,6 +210,48 @@ std::string EVT::getEVTString() const {
case MVT::v2f64: return "v2f64";
case MVT::v4f64: return "v4f64";
case MVT::v8f64: return "v8f64";
+ case MVT::nxv1i1: return "nxv1i1";
+ case MVT::nxv2i1: return "nxv2i1";
+ case MVT::nxv4i1: return "nxv4i1";
+ case MVT::nxv8i1: return "nxv8i1";
+ case MVT::nxv16i1: return "nxv16i1";
+ case MVT::nxv32i1: return "nxv32i1";
+ case MVT::nxv1i8: return "nxv1i8";
+ case MVT::nxv2i8: return "nxv2i8";
+ case MVT::nxv4i8: return "nxv4i8";
+ case MVT::nxv8i8: return "nxv8i8";
+ case MVT::nxv16i8: return "nxv16i8";
+ case MVT::nxv32i8: return "nxv32i8";
+ case MVT::nxv1i16: return "nxv1i16";
+ case MVT::nxv2i16: return "nxv2i16";
+ case MVT::nxv4i16: return "nxv4i16";
+ case MVT::nxv8i16: return "nxv8i16";
+ case MVT::nxv16i16:return "nxv16i16";
+ case MVT::nxv32i16:return "nxv32i16";
+ case MVT::nxv1i32: return "nxv1i32";
+ case MVT::nxv2i32: return "nxv2i32";
+ case MVT::nxv4i32: return "nxv4i32";
+ case MVT::nxv8i32: return "nxv8i32";
+ case MVT::nxv16i32:return "nxv16i32";
+ case MVT::nxv32i32:return "nxv32i32";
+ case MVT::nxv1i64: return "nxv1i64";
+ case MVT::nxv2i64: return "nxv2i64";
+ case MVT::nxv4i64: return "nxv4i64";
+ case MVT::nxv8i64: return "nxv8i64";
+ case MVT::nxv16i64:return "nxv16i64";
+ case MVT::nxv32i64:return "nxv32i64";
+ case MVT::nxv2f16: return "nxv2f16";
+ case MVT::nxv4f16: return "nxv4f16";
+ case MVT::nxv8f16: return "nxv8f16";
+ case MVT::nxv1f32: return "nxv1f32";
+ case MVT::nxv2f32: return "nxv2f32";
+ case MVT::nxv4f32: return "nxv4f32";
+ case MVT::nxv8f32: return "nxv8f32";
+ case MVT::nxv16f32:return "nxv16f32";
+ case MVT::nxv1f64: return "nxv1f64";
+ case MVT::nxv2f64: return "nxv2f64";
+ case MVT::nxv4f64: return "nxv4f64";
+ case MVT::nxv8f64: return "nxv8f64";
case MVT::Metadata:return "Metadata";
case MVT::Untyped: return "Untyped";
case MVT::exnref : return "exnref";
@@ -241,6 +288,7 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
case MVT::v32i1: return VectorType::get(Type::getInt1Ty(Context), 32);
case MVT::v64i1: return VectorType::get(Type::getInt1Ty(Context), 64);
case MVT::v128i1: return VectorType::get(Type::getInt1Ty(Context), 128);
+ case MVT::v256i1: return VectorType::get(Type::getInt1Ty(Context), 256);
case MVT::v512i1: return VectorType::get(Type::getInt1Ty(Context), 512);
case MVT::v1024i1: return VectorType::get(Type::getInt1Ty(Context), 1024);
case MVT::v1i8: return VectorType::get(Type::getInt8Ty(Context), 1);
@@ -254,6 +302,7 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
case MVT::v256i8: return VectorType::get(Type::getInt8Ty(Context), 256);
case MVT::v1i16: return VectorType::get(Type::getInt16Ty(Context), 1);
case MVT::v2i16: return VectorType::get(Type::getInt16Ty(Context), 2);
+ case MVT::v3i16: return VectorType::get(Type::getInt16Ty(Context), 3);
case MVT::v4i16: return VectorType::get(Type::getInt16Ty(Context), 4);
case MVT::v8i16: return VectorType::get(Type::getInt16Ty(Context), 8);
case MVT::v16i16: return VectorType::get(Type::getInt16Ty(Context), 16);
@@ -282,8 +331,11 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
case MVT::v32i64: return VectorType::get(Type::getInt64Ty(Context), 32);
case MVT::v1i128: return VectorType::get(Type::getInt128Ty(Context), 1);
case MVT::v2f16: return VectorType::get(Type::getHalfTy(Context), 2);
+ case MVT::v3f16: return VectorType::get(Type::getHalfTy(Context), 3);
case MVT::v4f16: return VectorType::get(Type::getHalfTy(Context), 4);
case MVT::v8f16: return VectorType::get(Type::getHalfTy(Context), 8);
+ case MVT::v16f16: return VectorType::get(Type::getHalfTy(Context), 16);
+ case MVT::v32f16: return VectorType::get(Type::getHalfTy(Context), 32);
case MVT::v1f32: return VectorType::get(Type::getFloatTy(Context), 1);
case MVT::v2f32: return VectorType::get(Type::getFloatTy(Context), 2);
case MVT::v3f32: return VectorType::get(Type::getFloatTy(Context), 3);
@@ -302,8 +354,92 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
case MVT::v2f64: return VectorType::get(Type::getDoubleTy(Context), 2);
case MVT::v4f64: return VectorType::get(Type::getDoubleTy(Context), 4);
case MVT::v8f64: return VectorType::get(Type::getDoubleTy(Context), 8);
+ case MVT::nxv1i1:
+ return VectorType::get(Type::getInt1Ty(Context), 1, /*Scalable=*/ true);
+ case MVT::nxv2i1:
+ return VectorType::get(Type::getInt1Ty(Context), 2, /*Scalable=*/ true);
+ case MVT::nxv4i1:
+ return VectorType::get(Type::getInt1Ty(Context), 4, /*Scalable=*/ true);
+ case MVT::nxv8i1:
+ return VectorType::get(Type::getInt1Ty(Context), 8, /*Scalable=*/ true);
+ case MVT::nxv16i1:
+ return VectorType::get(Type::getInt1Ty(Context), 16, /*Scalable=*/ true);
+ case MVT::nxv32i1:
+ return VectorType::get(Type::getInt1Ty(Context), 32, /*Scalable=*/ true);
+ case MVT::nxv1i8:
+ return VectorType::get(Type::getInt8Ty(Context), 1, /*Scalable=*/ true);
+ case MVT::nxv2i8:
+ return VectorType::get(Type::getInt8Ty(Context), 2, /*Scalable=*/ true);
+ case MVT::nxv4i8:
+ return VectorType::get(Type::getInt8Ty(Context), 4, /*Scalable=*/ true);
+ case MVT::nxv8i8:
+ return VectorType::get(Type::getInt8Ty(Context), 8, /*Scalable=*/ true);
+ case MVT::nxv16i8:
+ return VectorType::get(Type::getInt8Ty(Context), 16, /*Scalable=*/ true);
+ case MVT::nxv32i8:
+ return VectorType::get(Type::getInt8Ty(Context), 32, /*Scalable=*/ true);
+ case MVT::nxv1i16:
+ return VectorType::get(Type::getInt16Ty(Context), 1, /*Scalable=*/ true);
+ case MVT::nxv2i16:
+ return VectorType::get(Type::getInt16Ty(Context), 2, /*Scalable=*/ true);
+ case MVT::nxv4i16:
+ return VectorType::get(Type::getInt16Ty(Context), 4, /*Scalable=*/ true);
+ case MVT::nxv8i16:
+ return VectorType::get(Type::getInt16Ty(Context), 8, /*Scalable=*/ true);
+ case MVT::nxv16i16:
+ return VectorType::get(Type::getInt16Ty(Context), 16, /*Scalable=*/ true);
+ case MVT::nxv32i16:
+ return VectorType::get(Type::getInt16Ty(Context), 32, /*Scalable=*/ true);
+ case MVT::nxv1i32:
+ return VectorType::get(Type::getInt32Ty(Context), 1, /*Scalable=*/ true);
+ case MVT::nxv2i32:
+ return VectorType::get(Type::getInt32Ty(Context), 2, /*Scalable=*/ true);
+ case MVT::nxv4i32:
+ return VectorType::get(Type::getInt32Ty(Context), 4, /*Scalable=*/ true);
+ case MVT::nxv8i32:
+ return VectorType::get(Type::getInt32Ty(Context), 8, /*Scalable=*/ true);
+ case MVT::nxv16i32:
+ return VectorType::get(Type::getInt32Ty(Context), 16,/*Scalable=*/ true);
+ case MVT::nxv32i32:
+ return VectorType::get(Type::getInt32Ty(Context), 32,/*Scalable=*/ true);
+ case MVT::nxv1i64:
+ return VectorType::get(Type::getInt64Ty(Context), 1, /*Scalable=*/ true);
+ case MVT::nxv2i64:
+ return VectorType::get(Type::getInt64Ty(Context), 2, /*Scalable=*/ true);
+ case MVT::nxv4i64:
+ return VectorType::get(Type::getInt64Ty(Context), 4, /*Scalable=*/ true);
+ case MVT::nxv8i64:
+ return VectorType::get(Type::getInt64Ty(Context), 8, /*Scalable=*/ true);
+ case MVT::nxv16i64:
+ return VectorType::get(Type::getInt64Ty(Context), 16, /*Scalable=*/ true);
+ case MVT::nxv32i64:
+ return VectorType::get(Type::getInt64Ty(Context), 32, /*Scalable=*/ true);
+ case MVT::nxv2f16:
+ return VectorType::get(Type::getHalfTy(Context), 2, /*Scalable=*/ true);
+ case MVT::nxv4f16:
+ return VectorType::get(Type::getHalfTy(Context), 4, /*Scalable=*/ true);
+ case MVT::nxv8f16:
+ return VectorType::get(Type::getHalfTy(Context), 8, /*Scalable=*/ true);
+ case MVT::nxv1f32:
+ return VectorType::get(Type::getFloatTy(Context), 1, /*Scalable=*/ true);
+ case MVT::nxv2f32:
+ return VectorType::get(Type::getFloatTy(Context), 2, /*Scalable=*/ true);
+ case MVT::nxv4f32:
+ return VectorType::get(Type::getFloatTy(Context), 4, /*Scalable=*/ true);
+ case MVT::nxv8f32:
+ return VectorType::get(Type::getFloatTy(Context), 8, /*Scalable=*/ true);
+ case MVT::nxv16f32:
+ return VectorType::get(Type::getFloatTy(Context), 16, /*Scalable=*/ true);
+ case MVT::nxv1f64:
+ return VectorType::get(Type::getDoubleTy(Context), 1, /*Scalable=*/ true);
+ case MVT::nxv2f64:
+ return VectorType::get(Type::getDoubleTy(Context), 2, /*Scalable=*/ true);
+ case MVT::nxv4f64:
+ return VectorType::get(Type::getDoubleTy(Context), 4, /*Scalable=*/ true);
+ case MVT::nxv8f64:
+ return VectorType::get(Type::getDoubleTy(Context), 8, /*Scalable=*/ true);
case MVT::Metadata: return Type::getMetadataTy(Context);
- }
+ }
}
/// Return the value type corresponding to the specified type. This returns all
@@ -329,7 +465,8 @@ MVT MVT::getVT(Type *Ty, bool HandleUnknown){
case Type::VectorTyID: {
VectorType *VTy = cast<VectorType>(Ty);
return getVectorVT(
- getVT(VTy->getElementType(), false), VTy->getNumElements());
+ getVT(VTy->getElementType(), /*HandleUnknown=*/ false),
+ VTy->getElementCount());
}
}
}
@@ -345,8 +482,9 @@ EVT EVT::getEVT(Type *Ty, bool HandleUnknown){
return getIntegerVT(Ty->getContext(), cast<IntegerType>(Ty)->getBitWidth());
case Type::VectorTyID: {
VectorType *VTy = cast<VectorType>(Ty);
- return getVectorVT(Ty->getContext(), getEVT(VTy->getElementType(), false),
- VTy->getNumElements());
+ return getVectorVT(Ty->getContext(),
+ getEVT(VTy->getElementType(), /*HandleUnknown=*/ false),
+ VTy->getElementCount());
}
}
}
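Aside: getVT() and getEVT() now forward VTy->getElementCount() instead of a bare lane count, which is what lets <vscale x N x T> IR types reach the nxv* MVTs added earlier in this file. A minimal round-trip sketch (assumes an LLVMContext Ctx plus the ValueTypes.h and DerivedTypes.h headers):

    // Build <vscale x 4 x i32> and map it to a value type, as the updated
    // getEVT() path now does.
    Type *STy = VectorType::get(Type::getInt32Ty(Ctx), 4, /*Scalable=*/true);
    EVT VT = EVT::getEVT(STy);
    assert(VT.isScalableVector() && VT.getVectorNumElements() == 4);
    assert(VT.getEVTString() == "nxv4i32");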
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index 4a06704a8876..5312e2eea96b 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -80,15 +80,14 @@ void VirtRegMap::grow() {
Virt2SplitMap.resize(NumRegs);
}
-void VirtRegMap::assignVirt2Phys(unsigned virtReg, MCPhysReg physReg) {
- assert(TargetRegisterInfo::isVirtualRegister(virtReg) &&
- TargetRegisterInfo::isPhysicalRegister(physReg));
- assert(Virt2PhysMap[virtReg] == NO_PHYS_REG &&
+void VirtRegMap::assignVirt2Phys(Register virtReg, MCPhysReg physReg) {
+ assert(virtReg.isVirtual() && Register::isPhysicalRegister(physReg));
+ assert(Virt2PhysMap[virtReg.id()] == NO_PHYS_REG &&
"attempt to assign physical register to already mapped "
"virtual register");
assert(!getRegInfo().isReserved(physReg) &&
"Attempt to map virtReg to a reserved physReg");
- Virt2PhysMap[virtReg] = physReg;
+ Virt2PhysMap[virtReg.id()] = physReg;
}
unsigned VirtRegMap::createSpillSlot(const TargetRegisterClass *RC) {
@@ -99,46 +98,46 @@ unsigned VirtRegMap::createSpillSlot(const TargetRegisterClass *RC) {
return SS;
}
-bool VirtRegMap::hasPreferredPhys(unsigned VirtReg) {
- unsigned Hint = MRI->getSimpleHint(VirtReg);
- if (!Hint)
+bool VirtRegMap::hasPreferredPhys(Register VirtReg) {
+ Register Hint = MRI->getSimpleHint(VirtReg);
+ if (!Hint.isValid())
return false;
- if (TargetRegisterInfo::isVirtualRegister(Hint))
+ if (Hint.isVirtual())
Hint = getPhys(Hint);
return getPhys(VirtReg) == Hint;
}
-bool VirtRegMap::hasKnownPreference(unsigned VirtReg) {
+bool VirtRegMap::hasKnownPreference(Register VirtReg) {
std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(VirtReg);
- if (TargetRegisterInfo::isPhysicalRegister(Hint.second))
+ if (Register::isPhysicalRegister(Hint.second))
return true;
- if (TargetRegisterInfo::isVirtualRegister(Hint.second))
+ if (Register::isVirtualRegister(Hint.second))
return hasPhys(Hint.second);
return false;
}
-int VirtRegMap::assignVirt2StackSlot(unsigned virtReg) {
- assert(TargetRegisterInfo::isVirtualRegister(virtReg));
- assert(Virt2StackSlotMap[virtReg] == NO_STACK_SLOT &&
+int VirtRegMap::assignVirt2StackSlot(Register virtReg) {
+ assert(virtReg.isVirtual());
+ assert(Virt2StackSlotMap[virtReg.id()] == NO_STACK_SLOT &&
"attempt to assign stack slot to already spilled register");
const TargetRegisterClass* RC = MF->getRegInfo().getRegClass(virtReg);
- return Virt2StackSlotMap[virtReg] = createSpillSlot(RC);
+ return Virt2StackSlotMap[virtReg.id()] = createSpillSlot(RC);
}
-void VirtRegMap::assignVirt2StackSlot(unsigned virtReg, int SS) {
- assert(TargetRegisterInfo::isVirtualRegister(virtReg));
- assert(Virt2StackSlotMap[virtReg] == NO_STACK_SLOT &&
+void VirtRegMap::assignVirt2StackSlot(Register virtReg, int SS) {
+ assert(virtReg.isVirtual());
+ assert(Virt2StackSlotMap[virtReg.id()] == NO_STACK_SLOT &&
"attempt to assign stack slot to already spilled register");
assert((SS >= 0 ||
(SS >= MF->getFrameInfo().getObjectIndexBegin())) &&
"illegal fixed frame index");
- Virt2StackSlotMap[virtReg] = SS;
+ Virt2StackSlotMap[virtReg.id()] = SS;
}
void VirtRegMap::print(raw_ostream &OS, const Module*) const {
OS << "********** REGISTER MAP **********\n";
for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ unsigned Reg = Register::index2VirtReg(i);
if (Virt2PhysMap[Reg] != (unsigned)VirtRegMap::NO_PHYS_REG) {
OS << '[' << printReg(Reg, TRI) << " -> "
<< printReg(Virt2PhysMap[Reg], TRI) << "] "
@@ -147,7 +146,7 @@ void VirtRegMap::print(raw_ostream &OS, const Module*) const {
}
for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ unsigned Reg = Register::index2VirtReg(i);
if (Virt2StackSlotMap[Reg] != VirtRegMap::NO_STACK_SLOT) {
OS << '[' << printReg(Reg, TRI) << " -> fi#" << Virt2StackSlotMap[Reg]
<< "] " << TRI->getRegClassName(MRI->getRegClass(Reg)) << "\n";
@@ -185,10 +184,10 @@ class VirtRegRewriter : public MachineFunctionPass {
void rewrite();
void addMBBLiveIns();
bool readsUndefSubreg(const MachineOperand &MO) const;
- void addLiveInsForSubRanges(const LiveInterval &LI, unsigned PhysReg) const;
+ void addLiveInsForSubRanges(const LiveInterval &LI, Register PhysReg) const;
void handleIdentityCopy(MachineInstr &MI) const;
void expandCopyBundle(MachineInstr &MI) const;
- bool subRegLiveThrough(const MachineInstr &MI, unsigned SuperPhysReg) const;
+ bool subRegLiveThrough(const MachineInstr &MI, Register SuperPhysReg) const;
public:
static char ID;
@@ -265,7 +264,7 @@ bool VirtRegRewriter::runOnMachineFunction(MachineFunction &fn) {
}
void VirtRegRewriter::addLiveInsForSubRanges(const LiveInterval &LI,
- unsigned PhysReg) const {
+ Register PhysReg) const {
assert(!LI.empty());
assert(LI.hasSubRanges());
@@ -312,7 +311,7 @@ void VirtRegRewriter::addLiveInsForSubRanges(const LiveInterval &LI,
// assignments.
void VirtRegRewriter::addMBBLiveIns() {
for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) {
- unsigned VirtReg = TargetRegisterInfo::index2VirtReg(Idx);
+ Register VirtReg = Register::index2VirtReg(Idx);
if (MRI->reg_nodbg_empty(VirtReg))
continue;
LiveInterval &LI = LIS->getInterval(VirtReg);
@@ -320,7 +319,7 @@ void VirtRegRewriter::addMBBLiveIns() {
continue;
// This is a virtual register that is live across basic blocks. Its
// assigned PhysReg must be marked as live-in to those blocks.
- unsigned PhysReg = VRM->getPhys(VirtReg);
+ Register PhysReg = VRM->getPhys(VirtReg);
assert(PhysReg != VirtRegMap::NO_PHYS_REG && "Unmapped virtual register.");
if (LI.hasSubRanges()) {
@@ -353,7 +352,7 @@ bool VirtRegRewriter::readsUndefSubreg(const MachineOperand &MO) const {
if (MO.isUndef())
return true;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
const LiveInterval &LI = LIS->getInterval(Reg);
const MachineInstr &MI = *MO.getParent();
SlotIndex BaseIndex = LIS->getInstructionIndex(MI);
@@ -469,7 +468,7 @@ void VirtRegRewriter::expandCopyBundle(MachineInstr &MI) const {
/// \pre \p MI defines a subregister of a virtual register that
/// has been assigned to \p SuperPhysReg.
bool VirtRegRewriter::subRegLiveThrough(const MachineInstr &MI,
- unsigned SuperPhysReg) const {
+ Register SuperPhysReg) const {
SlotIndex MIIndex = LIS->getInstructionIndex(MI);
SlotIndex BeforeMIUses = MIIndex.getBaseIndex();
SlotIndex AfterMIDefs = MIIndex.getBoundaryIndex();
@@ -493,9 +492,9 @@ bool VirtRegRewriter::subRegLiveThrough(const MachineInstr &MI,
void VirtRegRewriter::rewrite() {
bool NoSubRegLiveness = !MRI->subRegLivenessEnabled();
- SmallVector<unsigned, 8> SuperDeads;
- SmallVector<unsigned, 8> SuperDefs;
- SmallVector<unsigned, 8> SuperKills;
+ SmallVector<Register, 8> SuperDeads;
+ SmallVector<Register, 8> SuperDefs;
+ SmallVector<Register, 8> SuperKills;
for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
MBBI != MBBE; ++MBBI) {
@@ -513,10 +512,10 @@ void VirtRegRewriter::rewrite() {
if (MO.isRegMask())
MRI->addPhysRegsUsedFromRegMask(MO.getRegMask());
- if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ if (!MO.isReg() || !MO.getReg().isVirtual())
continue;
- unsigned VirtReg = MO.getReg();
- unsigned PhysReg = VRM->getPhys(VirtReg);
+ Register VirtReg = MO.getReg();
+ Register PhysReg = VRM->getPhys(VirtReg);
assert(PhysReg != VirtRegMap::NO_PHYS_REG &&
"Instruction uses unmapped VirtReg");
assert(!MRI->isReserved(PhysReg) && "Reserved register assignment");
@@ -562,7 +561,7 @@ void VirtRegRewriter::rewrite() {
// PhysReg operands cannot have subregister indexes.
PhysReg = TRI->getSubReg(PhysReg, SubReg);
- assert(PhysReg && "Invalid SubReg for physical register");
+ assert(PhysReg.isValid() && "Invalid SubReg for physical register");
MO.setSubReg(0);
}
// Rewrite. Note we could have used MachineOperand::substPhysReg(), but
diff --git a/lib/CodeGen/XRayInstrumentation.cpp b/lib/CodeGen/XRayInstrumentation.cpp
index 19c59e9542b4..119c3fd1ec7f 100644
--- a/lib/CodeGen/XRayInstrumentation.cpp
+++ b/lib/CodeGen/XRayInstrumentation.cpp
@@ -111,7 +111,7 @@ void XRayInstrumentation::replaceRetWithPatchableRet(
MIB.add(MO);
Terminators.push_back(&T);
if (T.isCall())
- MF.updateCallSiteInfo(&T);
+ MF.eraseCallSiteInfo(&T);
}
}
}
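Aside: once replaceRetWithPatchableRet() has rewritten a terminator that is also a call (a tail call) into its patchable pseudo, the original instruction is deleted, so its call-site entry must be erased rather than carried over; the table must never hold an instruction that is about to go away. In brief:

    if (T.isCall())
      MF.eraseCallSiteInfo(&T); // drop the entry for the soon-to-be-deleted call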